Rework output format of pg_ndistinctHEAD master

The existing format of pg_ndistinct uses a single-object JSON structure where each key is itself a comma-separated list of attnums, like: {"3, 4": 11, "3, 6": 11, "4, 6": 11, "3, 4, 6": 11} While this is a very compact format, it is confusing to read and it is difficult to manipulate the values within the object. The new output format introduced in this commit is an array of objects, with: - A key named "attributes", that contains an array of attribute numbers. - A key named "ndistinct", represented as an integer. The values use the same underlying type as previously when printed, with a new output format that shows now as follows: [{"ndistinct": 11, "attributes": [3,4]}, {"ndistinct": 11, "attributes": [3,6]}, {"ndistinct": 11, "attributes": [4,6]}, {"ndistinct": 11, "attributes": [3,4,6]}] This new format will become handy for a follow-up set of changes, so as it becomes possible to inject extended statistics rather than require an ANALYZE, like in a dump/restore sequence or after pg_upgrade on a new cluster. This format has been suggested by Tomas Vondra. The key names are defined in a new header, to ease with the integration of frontend-specific changes that are still under discussion. (Personal note: I am not specifically wedded to these key names, but if there are better name suggestions for this release, feel free.) The bulk of the changes come from the regression tests, where jsonb_pretty() is now used to make the outputs generated easier to parse. Author: Corey Huinker <[email protected]> Reviewed-by: Jian He <[email protected]> Reviewed-by: Michael Paquier <[email protected]> Discussion: https://postgr.es/m/CADkLM=dpz3KFnqP-dgJ-zvRvtjsa8UZv8wDAQdqho=qN3kX0Zg@mail.gmail.com
author: Michael Paquier 2025-11-17 00:52:20 +0000
committer: Michael Paquier 2025-11-17 00:52:20 +0000
commit: 1f927cce44983ed59a3c1eccc95ad2946ac13b42 (patch)
tree: 45c484135ee3d71f421395ba0be535d4b9f25c32
parent: 32b236644d33f6877a2ba0a8e18e17c985f502ce (diff)
5 files changed, 223 insertions, 37 deletions
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 106583fb296..a88690150a8 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1576,12 +1576,42 @@ CREATE STATISTICS stts2 (ndistinct) ON city, state, zip FROM zipcodes;
 
 ANALYZE zipcodes;
 
-SELECT stxkeys AS k, stxdndistinct AS nd
-  FROM pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid)
+SELECT stxkeys AS k, jsonb_pretty(stxdndistinct::text::jsonb) AS nd
+  FROM pg_statistic_ext JOIN pg_statistic_ext_data on (oid = stxoid)
   WHERE stxname = 'stts2';
--[ RECORD 1 ]------------------------------------------------------&zwsp;--
+-[ RECORD 1 ]-------------------
 k  | 1 2 5
-nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
+nd | [                          +
+   |     {                      +
+   |         "ndistinct": 33178,+
+   |         "attributes": [    +
+   |             1,             +
+   |             2              +
+   |         ]                  +
+   |     },                     +
+   |     {                      +
+   |         "ndistinct": 33178,+
+   |         "attributes": [    +
+   |             1,             +
+   |             5              +
+   |         ]                  +
+   |     },                     +
+   |     {                      +
+   |         "ndistinct": 27435,+
+   |         "attributes": [    +
+   |             2,             +
+   |             5              +
+   |         ]                  +
+   |     },                     +
+   |     {                      +
+   |         "ndistinct": 33178,+
+   |         "attributes": [    +
+   |             1,             +
+   |             2,             +
+   |             5              +
+   |         ]                  +
+   |     }                      +
+   | ]
 (1 row)
 </programlisting>
      This indicates that there are three combinations of columns that
diff --git a/src/backend/utils/adt/pg_ndistinct.c b/src/backend/utils/adt/pg_ndistinct.c
index 667ada9c3b4..97efc290ef5 100644
--- a/src/backend/utils/adt/pg_ndistinct.c
+++ b/src/backend/utils/adt/pg_ndistinct.c
@@ -16,6 +16,7 @@
 
 #include "lib/stringinfo.h"
 #include "statistics/extended_stats_internal.h"
+#include "statistics/statistics_format.h"
 #include "utils/fmgrprotos.h"
 
 
@@ -51,26 +52,29 @@ pg_ndistinct_out(PG_FUNCTION_ARGS)
 	StringInfoData str;
 
 	initStringInfo(&str);
-	appendStringInfoChar(&str, '{');
+	appendStringInfoChar(&str, '[');
 
 	for (i = 0; i < ndist->nitems; i++)
 	{
-		int			j;
 		MVNDistinctItem item = ndist->items[i];
 
 		if (i > 0)
 			appendStringInfoString(&str, ", ");
 
-		for (j = 0; j < item.nattributes; j++)
-		{
-			AttrNumber	attnum = item.attributes[j];
+		if (item.nattributes <= 0)
+			elog(ERROR, "invalid zero-length attribute array in MVNDistinct");
 
-			appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
-		}
-		appendStringInfo(&str, "\": %d", (int) item.ndistinct);
+		appendStringInfo(&str, "{\"" PG_NDISTINCT_KEY_ATTRIBUTES "\": [%d",
+						 item.attributes[0]);
+
+		for (int j = 1; j < item.nattributes; j++)
+			appendStringInfo(&str, ", %d", item.attributes[j]);
+
+		appendStringInfo(&str, "], \"" PG_NDISTINCT_KEY_NDISTINCT "\": %d}",
+						 (int) item.ndistinct);
 	}
 
-	appendStringInfoChar(&str, '}');
+	appendStringInfoChar(&str, ']');
 
 	PG_RETURN_CSTRING(str.data);
 }
diff --git a/src/include/statistics/statistics_format.h b/src/include/statistics/statistics_format.h
new file mode 100644
index 00000000000..fb0ae58ff3b
--- /dev/null
+++ b/src/include/statistics/statistics_format.h
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * statistics_format.h
+ *	  Data related to the format of extended statistics, usable by both
+ *	  frontend and backend code.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/include/statistics/statistics_format.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STATISTICS_FORMAT_H
+#define STATISTICS_FORMAT_H
+
+/* ----------
+ * pg_ndistinct in human-readable format is a JSON array made of elements with
+ * a predefined set of keys, like:
+ *
+ * [{"ndistinct": 11, "attributes": [3,4]},
+ *  {"ndistinct": 11, "attributes": [3,6]},
+ *  {"ndistinct": 11, "attributes": [4,6]},
+ *  {"ndistinct": 11, "attributes": [3,4,6]},
+ *  ... ]
+ * ----------
+ */
+#define PG_NDISTINCT_KEY_ATTRIBUTES	"attributes"
+#define PG_NDISTINCT_KEY_NDISTINCT	"ndistinct"
+
+#endif							/* STATISTICS_FORMAT_H */
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 495a1b35018..e9379afe39e 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -196,7 +196,7 @@ Statistics objects:
     "public.ab1_a_b_stats" ON a, b FROM ab1; STATISTICS 0
 
 ANALYZE ab1;
-SELECT stxname, stxdndistinct, stxddependencies, stxdmcv, stxdinherit
+SELECT stxname, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct, stxddependencies, stxdmcv, stxdinherit
   FROM pg_statistic_ext s LEFT JOIN pg_statistic_ext_data d ON (d.stxoid = s.oid)
  WHERE s.stxname = 'ab1_a_b_stats';
     stxname    | stxdndistinct | stxddependencies | stxdmcv | stxdinherit 
@@ -476,13 +476,43 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (
 -- correct command
 CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
 ANALYZE ndistinct;
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
- stxkind |                    stxdndistinct                    
----------+-----------------------------------------------------
- {d,f,m} | {"3, 4": 11, "3, 6": 11, "4, 6": 11, "3, 4, 6": 11}
+ stxkind |      stxdndistinct       
+---------+--------------------------
+ {d,f,m} | [                       +
+         |     {                   +
+         |         "ndistinct": 11,+
+         |         "attributes": [ +
+         |             3,          +
+         |             4           +
+         |         ]               +
+         |     },                  +
+         |     {                   +
+         |         "ndistinct": 11,+
+         |         "attributes": [ +
+         |             3,          +
+         |             6           +
+         |         ]               +
+         |     },                  +
+         |     {                   +
+         |         "ndistinct": 11,+
+         |         "attributes": [ +
+         |             4,          +
+         |             6           +
+         |         ]               +
+         |     },                  +
+         |     {                   +
+         |         "ndistinct": 11,+
+         |         "attributes": [ +
+         |             3,          +
+         |             4,          +
+         |             6           +
+         |         ]               +
+         |     }                   +
+         | ]
 (1 row)
 
 -- minor improvement, make sure the ctid does not break the matching
@@ -558,13 +588,43 @@ INSERT INTO ndistinct (a, b, c, filler1)
             mod(i,23) || ' dollars and zero cents'
        FROM generate_series(1,1000) s(i);
 ANALYZE ndistinct;
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
- stxkind |                      stxdndistinct                       
----------+----------------------------------------------------------
- {d,f,m} | {"3, 4": 221, "3, 6": 247, "4, 6": 323, "3, 4, 6": 1000}
+ stxkind |       stxdndistinct        
+---------+----------------------------
+ {d,f,m} | [                         +
+         |     {                     +
+         |         "ndistinct": 221, +
+         |         "attributes": [   +
+         |             3,            +
+         |             4             +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 247, +
+         |         "attributes": [   +
+         |             3,            +
+         |             6             +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 323, +
+         |         "attributes": [   +
+         |             4,            +
+         |             6             +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 1000,+
+         |         "attributes": [   +
+         |             3,            +
+         |             4,            +
+         |             6             +
+         |         ]                 +
+         |     }                     +
+         | ]
 (1 row)
 
 -- correct estimates
@@ -623,7 +683,7 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (
 (1 row)
 
 DROP STATISTICS s10;
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
@@ -707,13 +767,43 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (
 
 CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
 ANALYZE ndistinct;
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
- stxkind |                           stxdndistinct                           
----------+-------------------------------------------------------------------
- {d,e}   | {"-1, -2": 221, "-1, -3": 247, "-2, -3": 323, "-1, -2, -3": 1000}
+ stxkind |       stxdndistinct        
+---------+----------------------------
+ {d,e}   | [                         +
+         |     {                     +
+         |         "ndistinct": 221, +
+         |         "attributes": [   +
+         |             -1,           +
+         |             -2            +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 247, +
+         |         "attributes": [   +
+         |             -1,           +
+         |             -3            +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 323, +
+         |         "attributes": [   +
+         |             -2,           +
+         |             -3            +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 1000,+
+         |         "attributes": [   +
+         |             -1,           +
+         |             -2,           +
+         |             -3            +
+         |         ]                 +
+         |     }                     +
+         | ]
 (1 row)
 
 SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
@@ -756,13 +846,43 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
 
 CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
 ANALYZE ndistinct;
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
- stxkind |                        stxdndistinct                        
----------+-------------------------------------------------------------
- {d,e}   | {"3, 4": 221, "3, -1": 247, "4, -1": 323, "3, 4, -1": 1000}
+ stxkind |       stxdndistinct        
+---------+----------------------------
+ {d,e}   | [                         +
+         |     {                     +
+         |         "ndistinct": 221, +
+         |         "attributes": [   +
+         |             3,            +
+         |             4             +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 247, +
+         |         "attributes": [   +
+         |             3,            +
+         |             -1            +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 323, +
+         |         "attributes": [   +
+         |             4,            +
+         |             -1            +
+         |         ]                 +
+         |     },                    +
+         |     {                     +
+         |         "ndistinct": 1000,+
+         |         "attributes": [   +
+         |             3,            +
+         |             4,            +
+         |             -1            +
+         |         ]                 +
+         |     }                     +
+         | ]
 (1 row)
 
 SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index fc6f152a072..fc4aee6d839 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -125,7 +125,7 @@ ALTER TABLE ab1 ALTER a SET STATISTICS -1;
 ALTER STATISTICS ab1_a_b_stats SET STATISTICS 0;
 \d ab1
 ANALYZE ab1;
-SELECT stxname, stxdndistinct, stxddependencies, stxdmcv, stxdinherit
+SELECT stxname, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct, stxddependencies, stxdmcv, stxdinherit
   FROM pg_statistic_ext s LEFT JOIN pg_statistic_ext_data d ON (d.stxoid = s.oid)
  WHERE s.stxname = 'ab1_a_b_stats';
 ALTER STATISTICS ab1_a_b_stats SET STATISTICS -1;
@@ -297,7 +297,7 @@ CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
 
 ANALYZE ndistinct;
 
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
@@ -338,7 +338,7 @@ INSERT INTO ndistinct (a, b, c, filler1)
 
 ANALYZE ndistinct;
 
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
@@ -364,7 +364,7 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (
 
 DROP STATISTICS s10;
 
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
@@ -399,7 +399,7 @@ CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
 
 ANALYZE ndistinct;
 
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
@@ -423,7 +423,7 @@ CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
 
 ANALYZE ndistinct;
 
-SELECT s.stxkind, d.stxdndistinct
+SELECT s.stxkind, jsonb_pretty(d.stxdndistinct::text::jsonb) AS stxdndistinct
   FROM pg_statistic_ext s, pg_statistic_ext_data d
  WHERE s.stxrelid = 'ndistinct'::regclass
    AND d.stxoid = s.oid;
author	Michael Paquier	2025-11-17 00:52:20 +0000
committer	Michael Paquier	2025-11-17 00:52:20 +0000
commit	1f927cce44983ed59a3c1eccc95ad2946ac13b42 (patch)
tree	45c484135ee3d71f421395ba0be535d4b9f25c32
parent	32b236644d33f6877a2ba0a8e18e17c985f502ce (diff)