Skip to content

Commit b86cb76

Browse files
lexborisovCommitfest Bot
authored andcommitted
Refactoring Unicode Normalization Forms, performance up.
Refactoring the Unicode Normalization Forms function unicode_normalize(). Function performance benchmarks. The benchmark was performed by directly calling the function and passing it data generated from ICU tests. Without patch: Normalization from NFC to NFD with PG: 009.121 Normalization from NFC to NFKD with PG: 009.048 Normalization from NFD to NFC with PG: 014.525 Normalization from NFD to NFKC with PG: 014.380 With patch: Normalization from NFC to NFD with PG: 001.580 Normalization from NFC to NFKD with PG: 001.634 Normalization from NFD to NFC with PG: 002.979 Normalization from NFD to NFKC with PG: 003.050 pgbench: The files were sent via pgbench. The files contain all code points that need to be normalized. NFC: Patch: tps = 9701.568161 Without: tps = 6820.828104 NFD: Patch: tps = 2707.155148 Without: tps = 1745.949174 NFKC: Patch: tps = 9893.952804 Without: tps = 6697.358888 NFKD: Patch: tps = 2580.785909 Without: tps = 1521.058417
1 parent 53f68ce commit b86cb76

File tree

3 files changed

+38505
-8076
lines changed

3 files changed

+38505
-8076
lines changed

src/common/unicode/generate-unicode_norm_table.pl

Lines changed: 125 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,10 @@
8686
class => $class,
8787
compat => $compat,
8888
decomp => \@decomp_elts,
89-
decomp_length => scalar @decomp_elts);
89+
decomp_length => scalar @decomp_elts,
90+
canonical => [],
91+
compatibility => [],
92+
decomp_eq => 0);
9093
push(@characters, \%char_entry);
9194
$character_hash{ hex($code) } = \%char_entry;
9295
}
@@ -131,15 +134,29 @@
131134
#define DECOMP_COMPAT 0x20 /* compatibility mapping */
132135
133136
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
137+
#define DECOMPOSITION_COMPAT_SIZE(x) (UnicodeDecompSizes[(x)->dec_index])
134138
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
135139
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
136140
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
137141
138142
HEADER
139143

144+
foreach my $code (sort { $a <=> $b } keys %character_hash)
145+
{
146+
my $entry = $character_hash{$code};
147+
148+
# Full Canonical Decomposition.
149+
$entry->{canonical} =
150+
resolve_decomposition($entry->{decomp}, \%character_hash, 0);
151+
# Full Compatibility Decomposition.
152+
$entry->{compatibility} =
153+
resolve_decomposition($entry->{decomp}, \%character_hash);
154+
}
155+
140156
# Sorting the values so that the table is always generated in the same way.
141157
my @sorted_codes = sort { $a <=> $b } keys %character_hash;
142-
my @decomp_codepoints;
158+
my @decomp_codepoints = (0);
159+
my @compat_sizes = (0);
143160

144161
# Collect all codepoints of the decomposition and remove duplicate sequences.
145162
# Be sure to sort by number of codepoints, from largest to smallest.
@@ -151,27 +168,51 @@
151168
} @sorted_codes)
152169
{
153170
my $entry = $character_hash{$code};
154-
my $decomp = $entry->{decomp};
171+
my $canonical = $entry->{canonical};
172+
my $compatibility = $entry->{compatibility};
173+
174+
# Canonical and Compatibility have one index for the UnicodeDecompCodepoints
175+
# table. The difference is that different Canonical and Compatibility values
176+
# are written to the tables sequentially, the size from Canonical is
177+
# specified in UnicodeDecompMain, and the size from Compatibility is
178+
# specified in a separately created uint8 UnicodeDecompSizes table.
179+
#
180+
# That is, if we have an index from the UnicodeDecompMain table, the same
181+
# index will be suitable for obtaining the size from UnicodeDecompSizes.
182+
$entry->{decomp_eq} = arrays_equal($canonical, $compatibility);
183+
184+
if (!$entry->{decomp_eq})
185+
{
186+
my $index = scalar @decomp_codepoints;
187+
push @decomp_codepoints, @$canonical, @$compatibility;
188+
189+
$compat_sizes[$index] = scalar @$compatibility;
190+
$entry->{decomp_index} = $index;
191+
192+
next;
193+
}
155194

156195
# Skip those values that will be stored directly in the main table.
157-
if ($entry->{decomp_length} == 0
158-
|| ($entry->{decomp_length} == 1 && length($decomp->[0]) <= 4))
196+
if (@$canonical == 0
197+
|| (@$canonical == 1 && length($canonical->[0]) <= 4))
159198
{
160199
next;
161200
}
162201

163202
# Search for a sequence of decomposition codepoints in the existing data.
164203
# If found, we assign a record index; otherwise, we add the sequence to the
165204
# end of the existing data.
166-
my $index = contains_subarray(\@decomp_codepoints, $decomp);
205+
my $index = contains_subarray(\@decomp_codepoints, $canonical);
167206

168-
if ($index == -1)
207+
if ((exists $compat_sizes[$index] && $compat_sizes[$index])
208+
|| $index == -1)
169209
{
170210
$index = scalar @decomp_codepoints;
171-
push @decomp_codepoints, @$decomp;
211+
push @decomp_codepoints, @$canonical;
172212
}
173213

174214
$entry->{decomp_index} = $index;
215+
$compat_sizes[$index] = 0;
175216
}
176217

177218
my $main_index = 1;
@@ -187,14 +228,15 @@
187228
my $class = $char->{class};
188229
my $compat = $char->{compat};
189230
my $decomp = $char->{decomp};
231+
my $canonical = $char->{canonical};
190232
my $index = $char->{decomp_index};
191233

192234
# Decomposition size
193235
# Print size of decomposition
194236
my $decomp_size = scalar(@$decomp);
195237
die if $decomp_size > 0x1F; # to not overrun bitmask
196238

197-
my $first_decomp = shift @$decomp;
239+
my $first_decomp = $decomp->[0];
198240
my $first_num = $first_decomp ? hex($first_decomp) : 0;
199241

200242
my $flags = "";
@@ -233,13 +275,18 @@
233275
# src/common/unicode_norm.c.
234276
if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
235277
{
236-
$inverse{$first_num}->{ hex($decomp->[0]) } = hex($code);
278+
$inverse{$first_num}->{ hex($decomp->[1]) } = hex($code);
237279
}
238280
}
239281

240282
my $line;
283+
$decomp_size = scalar @$canonical;
241284

242-
if ($decomp_size == 0)
285+
if (!$char->{decomp_eq})
286+
{
287+
$line = "\t{$class, $decomp_size$flags, $index}";
288+
}
289+
elsif ($decomp_size == 0)
243290
{
244291
$line = "\t{$class, 0$flags, 0}";
245292
}
@@ -291,6 +338,22 @@
291338
push @decomp_lines, ($index + 1) % 8 == 0 ? "\n\t" : " ";
292339
}
293340

341+
# Compile a table with sequences of sizes for full compatibility decomposition.
342+
my @sizes_lines = ("\t");
343+
344+
foreach my $index (0 .. $#compat_sizes)
345+
{
346+
my $size = $compat_sizes[$index] || 0;
347+
push @sizes_lines, $size;
348+
349+
# No separator, tabulation, or new line is needed after the last value.
350+
last if $index == $#compat_sizes;
351+
352+
push @sizes_lines, ",";
353+
# Each line will have 8 sizes.
354+
push @sizes_lines, ($index + 1) % 8 == 0 ? "\n\t" : " ";
355+
}
356+
294357
# Preparing data for codepoint composition. A situation where two code points
295358
# need to be converted into one (recomposition of the reordered Unicode string).
296359
#
@@ -336,6 +399,8 @@
336399
my $main_string = join "", @decomp_main;
337400
my $decomp_length = scalar @decomp_codepoints;
338401
my $decomp_string = join "", @decomp_lines;
402+
my $sizes_length = scalar @compat_sizes;
403+
my $sizes_string = join "", @sizes_lines;
339404
my $inverse_length = scalar @inverse_codes;
340405
my $inverse_string = "\t" . join ",\n\t", @inverse_codes;
341406
my $second_length = scalar @inverse_second;
@@ -355,6 +420,14 @@
355420
$decomp_string
356421
};
357422
423+
/*
424+
* Table of sizes for full compatibility decomposition.
425+
*/
426+
static const uint8 UnicodeDecompSizes[$sizes_length] =
427+
{
428+
$sizes_string
429+
};
430+
358431
/*
359432
* Table of Unicode codepoints for recomposition of the reordered
360433
* Unicode string.
@@ -497,3 +570,44 @@ sub contains_subarray
497570

498571
return -1;
499572
}
573+
574+
# Recursively expand a decomposition sequence into its fully resolved form.
#
# Arguments:
#   $decomps - array ref of code points written as hexadecimal strings.
#   $index   - hash ref keyed by numeric code point; each value is a
#              character entry providing at least 'decomp' (array ref of
#              hex strings) and 'compat' (numeric flag).
#   $type    - optional filter.  When undefined, every mapping found in
#              $index is expanded.  When defined, only entries whose
#              'compat' flag is numerically equal to $type are expanded;
#              any other code point is kept as is.
#
# Returns an array ref with the expanded code points (hex strings), in the
# same order as the input sequence.
sub resolve_decomposition
{
	my ($decomps, $index, $type) = @_;
	my @expanded;

	foreach my $cp (@$decomps)
	{
		# A plain rvalue lookup yields undef for unknown code points and
		# does not create the key.
		my $entry = $index->{ hex($cp) };

		my $expandable =
		     defined $entry
		  && scalar @{ $entry->{decomp} }
		  && (!defined $type || $type == $entry->{compat});

		# Code points without a usable mapping are copied through untouched.
		if (!$expandable)
		{
			push @expanded, $cp;
			next;
		}

		# Otherwise splice in the full expansion of the mapped sequence.
		push @expanded,
		  @{ resolve_decomposition($entry->{decomp}, $index, $type) };
	}

	return \@expanded;
}
600+
601+
# Compare two arrays element by element using string comparison.
#
# Arguments:
#   $arr1, $arr2 - array refs to compare.
#
# Returns 1 when both arrays have the same number of elements and every
# pair of elements at the same position is string-equal, 0 otherwise.
sub arrays_equal
{
	my ($arr1, $arr2) = @_;

	# Equal last indexes mean equal lengths.
	my $last = $#$arr1;
	return 0 unless $last == $#$arr2;

	my $pos = 0;
	while ($pos <= $last)
	{
		return 0 unless $arr1->[$pos] eq $arr2->[$pos];
		$pos++;
	}

	return 1;
}

0 commit comments

Comments
 (0)