Skip to content

Commit c4f812e

Browse files
jimjonesbrCommitfest Bot
authored andcommitted
Add xmlcanonicalize function
This patch adds the xmlcanonicalize(xml, boolean) function, which transforms a well-formed XML document into its canonical form according to the W3C Canonical XML 1.1 (C14N 1.1) specification. xmlcanonicalize(doc xml, keep_comments boolean DEFAULT true) RETURNS xml - doc: the XML document to be canonicalized. - keep_comments: whether to preserve XML comments (default: true). This function is implemented using the xmlC14NDocDumpMemory() function from libxml2’s Canonical XML (C14N) module.
1 parent 3853a69 commit c4f812e

File tree

8 files changed

+436
-0
lines changed

8 files changed

+436
-0
lines changed

doc/src/sgml/func/func-xml.sgml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,56 @@ SELECT xmltext('< foo & bar >');
6161
</para>
6262
</sect3>
6363

64+
<sect3 id="functions-producing-xml-xmlcanonicalize">
65+
<title><literal>xmlcanonicalize</literal></title>
66+
67+
<indexterm>
68+
<primary>xmlcanonicalize</primary>
69+
</indexterm>
70+
71+
<synopsis>
72+
<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type> [, <parameter>keep_comments</parameter> <type>boolean</type> DEFAULT <literal>true</literal>] ) <returnvalue>xml</returnvalue>
73+
74+
</synopsis>
75+
76+
<para>
77+
This function transforms a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink>,
78+
as defined by the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>, which standardizes the document's
79+
structure and syntax to facilitate comparison and validation.
80+
The <parameter>keep_comments</parameter> parameter controls whether XML comments from the input document are preserved or discarded.
81+
If omitted, it defaults to <literal>true</literal>.
82+
</para>
83+
84+
<para>
85+
Example:
86+
<screen><![CDATA[
87+
SELECT
88+
xmlcanonicalize(
89+
'<foo>
90+
<!-- a comment -->
91+
<bar c="3" b="2" a="1">42</bar>
92+
<empty/>
93+
</foo>'::xml);
94+
xmlcanonicalize
95+
-----------------------------------------------------------------------------
96+
<foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
97+
(1 row)
98+
99+
SELECT
100+
xmlcanonicalize(
101+
'<foo>
102+
<!-- a comment -->
103+
<bar c="3" b="2" a="1">42</bar>
104+
<empty/>
105+
</foo>'::xml, false);
106+
xmlcanonicalize
107+
-----------------------------------------------------------
108+
<foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
109+
(1 row)
110+
]]></screen>
111+
</para>
112+
</sect3>
113+
64114
<sect3 id="functions-producing-xml-xmlcomment">
65115
<title><literal>xmlcomment</literal></title>
66116

src/backend/catalog/system_functions.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,12 @@ CREATE OR REPLACE FUNCTION xpath_exists(text, xml)
268268
IMMUTABLE PARALLEL SAFE STRICT COST 1
269269
RETURN xpath_exists($1, $2, '{}'::text[]);
270270

271+
-- xmlcanonicalize(doc, keep_comments): canonical form of an XML document.
-- The parameters are named so that the documented signature
--   xmlcanonicalize(doc xml [, keep_comments boolean DEFAULT true])
-- can also be called with named notation, e.g.
--   xmlcanonicalize(doc => x, keep_comments => false).
-- STRICT: a NULL in either argument yields NULL without calling the C code.
CREATE OR REPLACE FUNCTION
  xmlcanonicalize(doc xml, keep_comments boolean DEFAULT true)
 RETURNS xml
 LANGUAGE internal
 IMMUTABLE PARALLEL SAFE STRICT
AS 'xmlcanonicalize';
276+
271277
CREATE OR REPLACE FUNCTION pg_sleep_for(interval)
272278
RETURNS void
273279
LANGUAGE sql

src/backend/utils/adt/xml.c

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include <libxml/xmlwriter.h>
5959
#include <libxml/xpath.h>
6060
#include <libxml/xpathInternals.h>
61+
#include <libxml/c14n.h>
6162

6263
/*
6364
* We used to check for xmlStructuredErrorContext via a configure test; but
@@ -565,6 +566,86 @@ xmltext(PG_FUNCTION_ARGS)
565566
#endif /* not USE_LIBXML */
566567
}
567568

569+
/*
570+
* Canonicalizes the given XML document according to the W3C Canonical XML 1.1
571+
* specification, using libxml2's xmlC14NDocDumpMemory().
572+
*
573+
* The input XML must be a well-formed document (not a fragment). The
574+
* canonical form is deterministic and useful for digital signatures and
575+
* comparing logically equivalent XML.
576+
*
577+
* The second argument determines whether comments are preserved
578+
* (true) or omitted (false) in the canonicalized output.
579+
*/
580+
Datum xmlcanonicalize(PG_FUNCTION_ARGS)
581+
{
582+
#ifdef USE_LIBXML
583+
xmltype *arg = PG_GETARG_XML_P(0);
584+
bool keep_comments = PG_GETARG_BOOL(1);
585+
text *result;
586+
volatile xmlChar *xmlbuf = NULL;
587+
volatile int nbytes = 0;
588+
volatile xmlDocPtr doc = NULL;
589+
PgXmlErrorContext *xmlerrcxt;
590+
591+
/* Set up XML error context for proper libxml2 error integration */
592+
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
593+
594+
PG_TRY();
595+
{
596+
/* Parse the input as a full XML document */
597+
doc = xml_parse(arg, XMLOPTION_DOCUMENT, false,
598+
GetDatabaseEncoding(), NULL, NULL, (Node *)xmlerrcxt);
599+
600+
/*
601+
* xmlC14NDocDumpMemory arguments:
602+
* - doc: the XML document to canonicalize (already parsed above)
603+
* - nodes: NULL means the entire document is canonicalized
604+
* - mode: 2 selects the Canonical XML 1.1 algorithm (xmlC14NMode enum)
605+
* - inclusive_ns_prefixes: NULL includes all namespaces by default
606+
* - with_comments: determined by keep_comments argument
607+
* - doc_txt_ptr: output buffer receiving the canonicalized XML (xmlbuf)
608+
*
609+
* On success, xmlbuf points to the serialized canonical form, and nbytes
610+
* holds its size.
611+
*/
612+
nbytes = xmlC14NDocDumpMemory(doc,
613+
NULL, /* entire document */
614+
2, /* xmlC14NMode 1.1 */
615+
NULL, /* all namespaces */
616+
keep_comments,
617+
(xmlChar **)&xmlbuf);
618+
619+
if (nbytes < 0 || xmlbuf == NULL || xmlerrcxt->err_occurred)
620+
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR,
621+
"could not canonicalize XML document");
622+
623+
result = cstring_to_text_with_len((const char *)xmlbuf, nbytes);
624+
}
625+
PG_CATCH();
626+
{
627+
if (doc)
628+
xmlFreeDoc((xmlDocPtr)doc);
629+
if (xmlbuf)
630+
xmlFree((xmlChar *)xmlbuf);
631+
632+
pg_xml_done(xmlerrcxt, true);
633+
PG_RE_THROW();
634+
}
635+
PG_END_TRY();
636+
637+
if (doc)
638+
xmlFreeDoc((xmlDocPtr)doc);
639+
if (xmlbuf)
640+
xmlFree((xmlChar *)xmlbuf);
641+
pg_xml_done(xmlerrcxt, false);
642+
643+
PG_RETURN_XML_P(result);
644+
#else
645+
NO_XML_SUPPORT();
646+
return 0;
647+
#endif /* not USE_LIBXML */
648+
}
568649

569650
/*
570651
* TODO: xmlconcat needs to merge the notations and unparsed entities

src/include/catalog/pg_proc.dat

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9179,6 +9179,9 @@
91799179
{ oid => '3813', descr => 'generate XML text node',
91809180
proname => 'xmltext', prorettype => 'xml', proargtypes => 'text',
91819181
prosrc => 'xmltext' },
9182+
{ oid => '3814', descr => 'generate the canonical form of an XML document',
9183+
proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool',
9184+
prosrc => 'xmlcanonicalize' },
91829185

91839186
{ oid => '2923', descr => 'map table contents to XML',
91849187
proname => 'table_to_xml', procost => '100', provolatile => 's',

src/test/regress/expected/xml.out

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,3 +1881,85 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
18811881
x&lt;P&gt;73&lt;/P&gt;0.42truej
18821882
(1 row)
18831883

1884+
-- xmlserialize: canonical
1885+
CREATE TABLE xmlcanonicalize_test (doc xml);
1886+
INSERT INTO xmlcanonicalize_test VALUES
1887+
('<?xml version="1.0" encoding="ISO-8859-1"?>
1888+
<!DOCTYPE doc SYSTEM "doc.dtd" [
1889+
<!ENTITY val "42">
1890+
<!ATTLIST xyz attr CDATA "default">
1891+
]>
1892+
1893+
<!-- attributes and namespces will be sorted -->
1894+
<foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
1895+
xmlns:b="http://www.ietf.org"
1896+
xmlns:a="http://www.w3.org"
1897+
xmlns="http://example.org">
1898+
1899+
<!-- Normalization of whitespace in start and end tags -->
1900+
<!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
1901+
<bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
1902+
1903+
<!-- empty element will be converted to start-end tag pair -->
1904+
<empty/>
1905+
1906+
<!-- text will be transcoded to UTF-8 -->
1907+
<transcode>&#49;</transcode>
1908+
1909+
<!-- whitespace inside tag will be preserved -->
1910+
<whitespace> 321 </whitespace>
1911+
1912+
<!-- empty namespace will be removed of child tag -->
1913+
<emptyns xmlns="" >
1914+
<emptyns_child xmlns=""></emptyns_child>
1915+
</emptyns>
1916+
1917+
<!-- CDATA section will be replaced by its value -->
1918+
<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
1919+
</foo> <!-- comment outside root element --> ');
1920+
SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
1921+
xmlcanonicalize
1922+
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1923+
<!-- attributes and namespces will be sorted --> +
1924+
<foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>+
1925+
<!-- comment outside root element -->
1926+
(1 row)
1927+
1928+
SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
1929+
xmlcanonicalize
1930+
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1931+
<foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>
1932+
(1 row)
1933+
1934+
SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
1935+
?column?
1936+
----------
1937+
t
1938+
(1 row)
1939+
1940+
SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
1941+
xmlcanonicalize
1942+
-----------------
1943+
1944+
(1 row)
1945+
1946+
SELECT xmlcanonicalize(NULL, true);
1947+
xmlcanonicalize
1948+
-----------------
1949+
1950+
(1 row)
1951+
1952+
\set VERBOSITY terse
1953+
SELECT xmlcanonicalize('', true);
1954+
ERROR: invalid XML document
1955+
SELECT xmlcanonicalize(' ', true);
1956+
ERROR: invalid XML document
1957+
SELECT xmlcanonicalize('foo', true);
1958+
ERROR: invalid XML document
1959+
SELECT xmlcanonicalize('');
1960+
ERROR: invalid XML document
1961+
SELECT xmlcanonicalize(' ');
1962+
ERROR: invalid XML document
1963+
SELECT xmlcanonicalize('foo');
1964+
ERROR: invalid XML document
1965+
\set VERBOSITY default

src/test/regress/expected/xml_1.out

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,3 +1496,83 @@ ERROR: unsupported XML feature
14961496
LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':...
14971497
^
14981498
DETAIL: This functionality requires the server to be built with libxml support.
1499+
-- xmlserialize: canonical
1500+
CREATE TABLE xmlcanonicalize_test (doc xml);
1501+
INSERT INTO xmlcanonicalize_test VALUES
1502+
('<?xml version="1.0" encoding="ISO-8859-1"?>
1503+
<!DOCTYPE doc SYSTEM "doc.dtd" [
1504+
<!ENTITY val "42">
1505+
<!ATTLIST xyz attr CDATA "default">
1506+
]>
1507+
1508+
<!-- attributes and namespces will be sorted -->
1509+
<foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
1510+
xmlns:b="http://www.ietf.org"
1511+
xmlns:a="http://www.w3.org"
1512+
xmlns="http://example.org">
1513+
1514+
<!-- Normalization of whitespace in start and end tags -->
1515+
<!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
1516+
<bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
1517+
1518+
<!-- empty element will be converted to start-end tag pair -->
1519+
<empty/>
1520+
1521+
<!-- text will be transcoded to UTF-8 -->
1522+
<transcode>&#49;</transcode>
1523+
1524+
<!-- whitespace inside tag will be preserved -->
1525+
<whitespace> 321 </whitespace>
1526+
1527+
<!-- empty namespace will be removed of child tag -->
1528+
<emptyns xmlns="" >
1529+
<emptyns_child xmlns=""></emptyns_child>
1530+
</emptyns>
1531+
1532+
<!-- CDATA section will be replaced by its value -->
1533+
<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
1534+
</foo> <!-- comment outside root element --> ');
1535+
ERROR: unsupported XML feature
1536+
LINE 2: ('<?xml version="1.0" encoding="ISO-8859-1"?>
1537+
^
1538+
DETAIL: This functionality requires the server to be built with libxml support.
1539+
SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
1540+
xmlcanonicalize
1541+
-----------------
1542+
(0 rows)
1543+
1544+
SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
1545+
xmlcanonicalize
1546+
-----------------
1547+
(0 rows)
1548+
1549+
SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
1550+
?column?
1551+
----------
1552+
(0 rows)
1553+
1554+
SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
1555+
xmlcanonicalize
1556+
-----------------
1557+
(0 rows)
1558+
1559+
SELECT xmlcanonicalize(NULL, true);
1560+
xmlcanonicalize
1561+
-----------------
1562+
1563+
(1 row)
1564+
1565+
\set VERBOSITY terse
1566+
SELECT xmlcanonicalize('', true);
1567+
ERROR: unsupported XML feature at character 24
1568+
SELECT xmlcanonicalize(' ', true);
1569+
ERROR: unsupported XML feature at character 24
1570+
SELECT xmlcanonicalize('foo', true);
1571+
ERROR: unsupported XML feature at character 24
1572+
SELECT xmlcanonicalize('');
1573+
ERROR: unsupported XML feature at character 24
1574+
SELECT xmlcanonicalize(' ');
1575+
ERROR: unsupported XML feature at character 24
1576+
SELECT xmlcanonicalize('foo');
1577+
ERROR: unsupported XML feature at character 24
1578+
\set VERBOSITY default

0 commit comments

Comments
 (0)