postgresql-cfbot
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
Lines changed: 125 additions & 11 deletions
@@ -86,7 +86,10 @@
 		class => $class,
 		compat => $compat,
 		decomp => \@decomp_elts,
-		decomp_length => scalar @decomp_elts);
+		decomp_length => scalar @decomp_elts,
+		canonical => [],
+		compatibility => [],
+		decomp_eq => 0);
 	push(@characters, \%char_entry);
 	$character_hash{ hex($code) } = \%char_entry;
 }
@@ -131,15 +134,29 @@
 #define DECOMP_COMPAT		0x20	/* compatibility mapping */
 
 #define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_COMPAT_SIZE(x) (UnicodeDecompSizes[(x)->dec_index])
 #define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
 #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
 #define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
 
 HEADER
 
+foreach my $code (sort { $a <=> $b } keys %character_hash)
+{
+	my $entry = $character_hash{$code};
+
+	# Full Canonical Decomposition.
+	$entry->{canonical} =
+	  resolve_decomposition($entry->{decomp}, \%character_hash, 0);
+	# Full Compatibility Decomposition.
+	$entry->{compatibility} =
+	  resolve_decomposition($entry->{decomp}, \%character_hash);
+}
+
 # Sorting the values so that the table is always generated in the same way.
 my @sorted_codes = sort { $a <=> $b } keys %character_hash;
-my @decomp_codepoints;
+my @decomp_codepoints = (0);
+my @compat_sizes = (0);
 
 # Collect all codepoints of the decomposition and remove duplicate sequences.
 # Be sure to sort by number of codepoints, from largest to smallest.
@@ -151,27 +168,51 @@
 	} @sorted_codes)
 {
 	my $entry = $character_hash{$code};
-	my $decomp = $entry->{decomp};
+	my $canonical = $entry->{canonical};
+	my $compatibility = $entry->{compatibility};
+
+	# The canonical and compatibility decompositions of a character share one
+	# index into the UnicodeDecompCodepoints table. When the two differ, they
+	# are written to that table back to back (canonical first); the canonical
+	# size is stored in UnicodeDecompMain, while the compatibility size is
+	# stored in a separate uint8 table, UnicodeDecompSizes.
+	#
+	# That is, the index taken from a UnicodeDecompMain entry can also be used
+	# to look up the compatibility size in UnicodeDecompSizes.
+	$entry->{decomp_eq} = arrays_equal($canonical, $compatibility);
+
+	if (!$entry->{decomp_eq})
+	{
+		my $index = scalar @decomp_codepoints;
+		push @decomp_codepoints, @$canonical, @$compatibility;
+
+		$compat_sizes[$index] = scalar @$compatibility;
+		$entry->{decomp_index} = $index;
+
+		next;
+	}
 
 	# Skip those values that will be stored directly in the main table.
-	if ($entry->{decomp_length} == 0
-		|| ($entry->{decomp_length} == 1 && length($decomp->[0]) <= 4))
+	if (@$canonical == 0
+		|| (@$canonical == 1 && length($canonical->[0]) <= 4))
 	{
 		next;
 	}
 
 	# Search for a sequence of decomposition codepoints in the existing data.
 	# If found, we assign a record index; otherwise, we add the sequence to the
 	# end of the existing data.
-	my $index = contains_subarray(\@decomp_codepoints, $decomp);
+	my $index = contains_subarray(\@decomp_codepoints, $canonical);
 
-	if ($index == -1)
+	if ((exists $compat_sizes[$index] && $compat_sizes[$index])
+		|| $index == -1)
 	{
 		$index = scalar @decomp_codepoints;
-		push @decomp_codepoints, @$decomp;
+		push @decomp_codepoints, @$canonical;
 	}
 
 	$entry->{decomp_index} = $index;
+	$compat_sizes[$index] = 0;
 }
 
 my $main_index = 1;
@@ -187,14 +228,15 @@
 	my $class = $char->{class};
 	my $compat = $char->{compat};
 	my $decomp = $char->{decomp};
+	my $canonical = $char->{canonical};
 	my $index = $char->{decomp_index};
 
 	# Decomposition size
 	# Print size of decomposition
 	my $decomp_size = scalar(@$decomp);
 	die if $decomp_size > 0x1F;    # to not overrun bitmask
 
-	my $first_decomp = shift @$decomp;
+	my $first_decomp = $decomp->[0];
 	my $first_num = $first_decomp ? hex($first_decomp) : 0;
 
 	my $flags = "";
@@ -233,13 +275,18 @@
 		# src/common/unicode_norm.c.
 		if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
 		{
-			$inverse{$first_num}->{ hex($decomp->[0]) } = hex($code);
+			$inverse{$first_num}->{ hex($decomp->[1]) } = hex($code);
 		}
 	}
 
 	my $line;
+	$decomp_size = scalar @$canonical;
 
-	if ($decomp_size == 0)
+	if (!$char->{decomp_eq})
+	{
+		$line = "\t{$class, $decomp_size$flags, $index}";
+	}
+	elsif ($decomp_size == 0)
 	{
 		$line = "\t{$class, 0$flags, 0}";
 	}
@@ -291,6 +338,22 @@
 	push @decomp_lines, ($index + 1) % 8 == 0 ? "\n\t" : " ";
 }
 
+# Compile a table with sequences of sizes for full compatibility decomposition.
+my @sizes_lines = ("\t");
+
+foreach my $index (0 .. $#compat_sizes)
+{
+	my $size = $compat_sizes[$index] || 0;
+	push @sizes_lines, $size;
+
+	# No comma, space or line break is needed after the last value.
+	last if $index == $#compat_sizes;
+
+	push @sizes_lines, ",";
+	# Each line will have 8 sizes.
+	push @sizes_lines, ($index + 1) % 8 == 0 ? "\n\t" : " ";
+}
+
 # Preparing data for codepoint composition. A situation where two code points
 # need to be converted into one (recomposition of the reordered Unicode string).
 #
@@ -336,6 +399,8 @@
 my $main_string = join "", @decomp_main;
 my $decomp_length = scalar @decomp_codepoints;
 my $decomp_string = join "", @decomp_lines;
+my $sizes_length = scalar @compat_sizes;
+my $sizes_string = join "", @sizes_lines;
 my $inverse_length = scalar @inverse_codes;
 my $inverse_string = "\t" . join ",\n\t", @inverse_codes;
 my $second_length = scalar @inverse_second;
@@ -355,6 +420,14 @@
 $decomp_string
 };
 
+/*
+ * Table of sizes for full compatibility decomposition.
+ */
+static const uint8 UnicodeDecompSizes[$sizes_length] =
+{
+$sizes_string
+};
+
 /*
  * Table of Unicode codepoints for recomposition of the reordered
  * Unicode string.
@@ -497,3 +570,44 @@ sub contains_subarray
 
 	return -1;
 }
+
+# Recursively expand the codepoints in $decomps (hex strings) by looking each
+# one up in the $index hash, which is keyed by numeric codepoint.  A codepoint
+# is expanded when its entry has a non-empty decomposition and either $type is
+# undefined or $type equals the entry's "compat" field; any other codepoint is
+# kept as is.  Returns a reference to the flattened list.
+sub resolve_decomposition
+{
+	my ($decomps, $index, $type) = @_;
+	my @expanded;
+
+	foreach my $cp (@$decomps)
+	{
+		my $entry = $index->{ hex($cp) };
+		my $expand = defined $entry && scalar @{ $entry->{decomp} };
+
+		# With a defined $type, follow only entries whose field matches it.
+		$expand &&= $type == $entry->{compat} if defined $type;
+
+		push @expanded,
+		  $expand
+		  ? @{ resolve_decomposition($entry->{decomp}, $index, $type) }
+		  : $cp;
+	}
+
+	return \@expanded;
+}
+
+# Return 1 if both array refs hold the same strings in the same order, else 0.
+sub arrays_equal
+{
+	my ($left, $right) = @_;
+	my $last = $#$left;
+
+	# Arrays of different lengths can never match.
+	return 0 unless $last == $#$right;
+
+	# Compare pairwise as strings; any differing position means inequality.
+	my @diff = grep { $left->[$_] ne $right->[$_] } 0 .. $last;
+	return @diff ? 0 : 1;
+}