|
86 | 86 | class => $class, |
87 | 87 | compat => $compat, |
88 | 88 | decomp => \@decomp_elts, |
89 | | - decomp_length => scalar @decomp_elts); |
| 89 | + decomp_length => scalar @decomp_elts, |
| 90 | + canonical => [], |
| 91 | + compatibility => [], |
| 92 | + decomp_eq => 0); |
90 | 93 | push(@characters, \%char_entry); |
91 | 94 | $character_hash{ hex($code) } = \%char_entry; |
92 | 95 | } |
|
131 | 134 | #define DECOMP_COMPAT 0x20 /* compatibility mapping */ |
132 | 135 |
|
133 | 136 | #define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) |
| 137 | +#define DECOMPOSITION_COMPAT_SIZE(x) (UnicodeDecompSizes[(x)->dec_index]) |
134 | 138 | #define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) |
135 | 139 | #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) |
136 | 140 | #define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) |
137 | 141 |
|
138 | 142 | HEADER |
139 | 143 |
|
| 144 | +foreach my $code (sort { $a <=> $b } keys %character_hash) |
| 145 | +{ |
| 146 | + my $entry = $character_hash{$code}; |
| 147 | + |
| 148 | + # Full Canonical Decomposition. |
| 149 | + $entry->{canonical} = |
| 150 | + resolve_decomposition($entry->{decomp}, \%character_hash, 0); |
| 151 | + # Full Compatibility Decomposition. |
| 152 | + $entry->{compatibility} = |
| 153 | + resolve_decomposition($entry->{decomp}, \%character_hash); |
| 154 | +} |
| 155 | + |
140 | 156 | # Sorting the values so that the table is always generated in the same way. |
141 | 157 | my @sorted_codes = sort { $a <=> $b } keys %character_hash; |
142 | | -my @decomp_codepoints; |
| 158 | +my @decomp_codepoints = (0); |
| 159 | +my @compat_sizes = (0); |
143 | 160 |
|
144 | 161 | # Collect all codepoints of the decomposition and remove duplicate sequences. |
145 | 162 | # Be sure to sort by number of codepoints, from largest to smallest. |
|
151 | 168 | } @sorted_codes) |
152 | 169 | { |
153 | 170 | my $entry = $character_hash{$code}; |
154 | | - my $decomp = $entry->{decomp}; |
| 171 | + my $canonical = $entry->{canonical}; |
| 172 | + my $compatibility = $entry->{compatibility}; |
| 173 | + |
| 174 | + # The canonical and compatibility decompositions share a single index into
| 175 | + # the UnicodeDecompCodepoints table. When the two differ, the canonical
| 176 | + # sequence is written first and the compatibility sequence immediately
| 177 | + # after it; the canonical size is stored in UnicodeDecompMain, and the
| 178 | + # compatibility size in the separate uint8 UnicodeDecompSizes table.
| 179 | + #
| 180 | + # That is, an index taken from the UnicodeDecompMain table can also be used
| 181 | + # to look up the size in UnicodeDecompSizes.
| 182 | + $entry->{decomp_eq} = arrays_equal($canonical, $compatibility); |
| 183 | + |
| 184 | + if (!$entry->{decomp_eq}) |
| 185 | + { |
| 186 | + my $index = scalar @decomp_codepoints; |
| 187 | + push @decomp_codepoints, @$canonical, @$compatibility; |
| 188 | + |
| 189 | + $compat_sizes[$index] = scalar @$compatibility; |
| 190 | + $entry->{decomp_index} = $index; |
| 191 | + |
| 192 | + next; |
| 193 | + } |
155 | 194 |
|
156 | 195 | # Skip those values that will be stored directly in the main table. |
157 | | - if ($entry->{decomp_length} == 0 |
158 | | - || ($entry->{decomp_length} == 1 && length($decomp->[0]) <= 4)) |
| 196 | + if (@$canonical == 0 |
| 197 | + || (@$canonical == 1 && length($canonical->[0]) <= 4)) |
159 | 198 | { |
160 | 199 | next; |
161 | 200 | } |
162 | 201 |
|
163 | 202 | # Search for a sequence of decomposition codepoints in the existing data. |
164 | 203 | # If found, we assign a record index; otherwise, we add the sequence to the |
165 | 204 | # end of the existing data. |
166 | | - my $index = contains_subarray(\@decomp_codepoints, $decomp); |
| 205 | + my $index = contains_subarray(\@decomp_codepoints, $canonical); |
167 | 206 |
|
168 | | - if ($index == -1) |
| 207 | + if ((exists $compat_sizes[$index] && $compat_sizes[$index]) |
| 208 | + || $index == -1) |
169 | 209 | { |
170 | 210 | $index = scalar @decomp_codepoints; |
171 | | - push @decomp_codepoints, @$decomp; |
| 211 | + push @decomp_codepoints, @$canonical; |
172 | 212 | } |
173 | 213 |
|
174 | 214 | $entry->{decomp_index} = $index; |
| 215 | + $compat_sizes[$index] = 0; |
175 | 216 | } |
176 | 217 |
|
177 | 218 | my $main_index = 1; |
|
187 | 228 | my $class = $char->{class}; |
188 | 229 | my $compat = $char->{compat}; |
189 | 230 | my $decomp = $char->{decomp}; |
| 231 | + my $canonical = $char->{canonical}; |
190 | 232 | my $index = $char->{decomp_index}; |
191 | 233 |
|
192 | 234 | # Decomposition size |
193 | 235 | # Print size of decomposition |
194 | 236 | my $decomp_size = scalar(@$decomp); |
195 | 237 | die if $decomp_size > 0x1F; # to not overrun bitmask |
196 | 238 |
|
197 | | - my $first_decomp = shift @$decomp; |
| 239 | + my $first_decomp = $decomp->[0]; |
198 | 240 | my $first_num = $first_decomp ? hex($first_decomp) : 0; |
199 | 241 |
|
200 | 242 | my $flags = ""; |
|
233 | 275 | # src/common/unicode_norm.c. |
234 | 276 | if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/)) |
235 | 277 | { |
236 | | - $inverse{$first_num}->{ hex($decomp->[0]) } = hex($code); |
| 278 | + $inverse{$first_num}->{ hex($decomp->[1]) } = hex($code); |
237 | 279 | } |
238 | 280 | } |
239 | 281 |
|
240 | 282 | my $line; |
| 283 | + $decomp_size = scalar @$canonical; |
241 | 284 |
|
242 | | - if ($decomp_size == 0) |
| 285 | + if (!$char->{decomp_eq}) |
| 286 | + { |
| 287 | + $line = "\t{$class, $decomp_size$flags, $index}"; |
| 288 | + } |
| 289 | + elsif ($decomp_size == 0) |
243 | 290 | { |
244 | 291 | $line = "\t{$class, 0$flags, 0}"; |
245 | 292 | } |
|
291 | 338 | push @decomp_lines, ($index + 1) % 8 == 0 ? "\n\t" : " "; |
292 | 339 | } |
293 | 340 |
|
| 341 | +# Compile a table with sequences of sizes for full compatibility decomposition.
| 342 | +my @sizes_lines = ("\t"); |
| 343 | + |
| 344 | +foreach my $index (0 .. $#compat_sizes) |
| 345 | +{ |
| 346 | + my $size = $compat_sizes[$index] || 0; |
| 347 | + push @sizes_lines, $size; |
| 348 | + |
| 349 | + # No separator or new line is needed after the last value.
| 350 | + last if $index == $#compat_sizes; |
| 351 | + |
| 352 | + push @sizes_lines, ","; |
| 353 | + # Each line will have 8 sizes.
| 354 | + push @sizes_lines, ($index + 1) % 8 == 0 ? "\n\t" : " "; |
| 355 | +} |
| 356 | + |
294 | 357 | # Preparing data for codepoint composition. A situation where two code points |
295 | 358 | # need to be converted into one (recomposition of the reordered Unicode string). |
296 | 359 | # |
|
336 | 399 | my $main_string = join "", @decomp_main; |
337 | 400 | my $decomp_length = scalar @decomp_codepoints; |
338 | 401 | my $decomp_string = join "", @decomp_lines; |
| 402 | +my $sizes_length = scalar @compat_sizes; |
| 403 | +my $sizes_string = join "", @sizes_lines; |
339 | 404 | my $inverse_length = scalar @inverse_codes; |
340 | 405 | my $inverse_string = "\t" . join ",\n\t", @inverse_codes; |
341 | 406 | my $second_length = scalar @inverse_second; |
|
355 | 420 | $decomp_string |
356 | 421 | }; |
357 | 422 |
|
| 423 | +/* |
| 424 | + * Table of sizes for full compatibility decomposition. |
| 425 | + */ |
| 426 | +static const uint8 UnicodeDecompSizes[$sizes_length] = |
| 427 | +{ |
| 428 | +$sizes_string |
| 429 | +}; |
| 430 | +
|
358 | 431 | /* |
359 | 432 | * Table of Unicode codepoints for recomposition of the reordered |
360 | 433 | * Unicode string. |
@@ -497,3 +570,44 @@ sub contains_subarray |
497 | 570 |
|
498 | 571 | return -1; |
499 | 572 | } |
| 573 | + |
# Recursively expand a decomposition sequence into its full decomposition.
#
# Arguments:
#   $decomps - array ref of codepoints written as hex strings
#   $index   - hash ref keyed by numeric codepoint; each value is a character
#              entry hash that provides 'decomp' (array ref of hex strings)
#              and 'compat' (flag compared numerically against $type)
#   $type    - optional; when defined, only entries whose 'compat' value is
#              numerically equal to it are expanded; when undefined, every
#              entry that has a non-empty 'decomp' is expanded
#
# Returns a reference to a new array of hex strings.  Codepoints that are not
# present in $index, have an empty 'decomp', or are rejected by the $type
# filter are copied to the result unchanged.
sub resolve_decomposition
{
	my ($decomps, $index, $type) = @_;
	my @expanded;

	for my $codepoint (@$decomps)
	{
		my $key = hex($codepoint);
		my $mapping = exists $index->{$key} ? $index->{$key} : undef;

		# Expand only known characters that have a mapping of the wanted type.
		my $recurse =
		     defined $mapping
		  && @{ $mapping->{decomp} } > 0
		  && (!defined $type || $type == $mapping->{compat});

		if (!$recurse)
		{
			push @expanded, $codepoint;
			next;
		}

		push @expanded,
		  @{ resolve_decomposition($mapping->{decomp}, $index, $type) };
	}

	return \@expanded;
}
| 600 | + |
# Compare two arrays of strings element by element.
#
# Takes two array references and returns 1 when both arrays have the same
# number of elements and every pair of elements at the same position is equal
# under string comparison; returns 0 otherwise.
sub arrays_equal
{
	my ($arr1, $arr2) = @_;
	my $count = scalar @$arr1;

	# Arrays of different lengths can never match.
	return 0 unless $count == scalar @$arr2;

	my $pos = 0;
	while ($pos < $count)
	{
		return 0 unless $arr1->[$pos] eq $arr2->[$pos];
		$pos++;
	}

	return 1;
}
0 commit comments