Skip to content

Commit 31063d0

Browse files
author
Commitfest Bot
committed
[CF 5802] v6 - Improve the performance of Unicode Normalization Forms.
This branch was automatically generated by a robot using patches from an email thread registered at <https://commitfest.postgresql.org/patch/5802>. The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): <https://www.postgresql.org/message-id/[email protected]>. Author(s): Alexander Borisov.
2 parents 5310fac + b86cb76 commit 31063d0

File tree

7 files changed

+55763
-12827
lines changed

7 files changed

+55763
-12827
lines changed

src/common/unicode/generate-unicode_case_table.pl

Lines changed: 8 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use FindBin;
1616
use lib "$FindBin::RealBin/../../tools/";
17+
use GenerateSparseArray;
1718

1819
my $output_path = '.';
1920

@@ -465,11 +466,11 @@ sub get_hash_key
465466

466467
print $OT "\n};\n";
467468

469+
my $case_table_name = "case_map";
468470
my @codepoints = keys %simple;
469-
my $range = make_ranges(\@codepoints, 500);
470-
my @case_map_lines = range_tables($range);
471-
my $case_map_length = scalar @case_map_lines;
472-
my $case_map_table = join "\n", @case_map_lines;
471+
my $range = GenerateSparseArray::make(\@codepoints, 500);
472+
my $case_map_table = GenerateSparseArray::tables($range, $case_table_name,
473+
sub { $simple{ $_[0] }{Index} || 0 });
473474

474475
print $OT <<"EOS";
475476
@@ -478,10 +479,7 @@ sub get_hash_key
478479
* of the following arrays: case_map_lower, case_map_title, case_map_upper,
479480
* case_map_fold.
480481
*/
481-
static const uint16 case_map[$case_map_length] =
482-
{
483482
$case_map_table
484-
};
485483
486484
487485
EOS
@@ -506,12 +504,13 @@ sub get_hash_key
506504
/* Fast path for codepoints < $fastpath_limit */
507505
if (cp < $fastpath_limit)
508506
{
509-
return case_map[cp];
507+
return $case_table_name\[cp];
510508
}
511509
512510
EOS
513511

514-
print $OT join("\n", @{ branch($range, 0, $#$range, 1) });
512+
print $OT GenerateSparseArray::branch_as_text($range, 0, $#$range, 1,
513+
$case_table_name);
515514

516515
print $OT <<"EOS";
517516
@@ -521,146 +520,3 @@ sub get_hash_key
521520
EOS
522521

523522
close $OT;
524-
525-
# Generate C code consisting of nested if/else conditions that perform a
# binary search over the ranges to find the interval containing "cp".
#
# Arguments:
#   $range  - array ref of hashes with the keys Start, End and Offset;
#             End is exclusive (the emitted test is "cp < End")
#   $from   - index of the first range to consider
#   $to     - index of the last range to consider
#   $indent - number of leading tabs for the emitted code
#   $table  - optional name of the C array being indexed; defaults to
#             "case_map" so existing four-argument callers are unaffected
#
# Returns an array ref of C source lines (no trailing newlines).  The
# result is empty if the midpoint index does not exist in $range.
sub branch
{
	my ($range, $from, $to, $indent, $table) = @_;
	my ($idx, $space, $entry, @result);

	$table = 'case_map' unless defined $table;

	# Midpoint of the [$from, $to] slice.
	$idx = ($from + int(($to - $from) / 2));
	return \@result unless exists $range->[$idx];

	$space = "\t" x $indent;

	$entry = $range->[$idx];

	# IF state
	if ($idx == $from)
	{
		# Leaf: nothing remains to the left of this range.  Only the very
		# first range needs an explicit lower bound; for the others the
		# enclosing "else if (cp >= ...)" has already established it.
		if ($idx == 0)
		{
			push @result,
			  sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{",
				$space, $entry->{Start}, $entry->{End}, $space);
		}
		else
		{
			push @result,
			  sprintf("%sif (cp < 0x%04X)\n%s{",
				$space, $entry->{End}, $space);
		}

		push @result,
		  sprintf("%s\treturn %s[cp - 0x%04X + %d];",
			$space, $table, $entry->{Start}, $entry->{Offset});
	}
	else
	{
		# Inner node: everything below this range's End is resolved by
		# the left half, one level deeper.
		push @result,
		  sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space);
		push @result,
		  @{ branch($range, $from, $idx - 1, $indent + 1, $table) };
	}

	push @result, $space . "}";

	# return now if it's the last range
	return \@result if $idx == (scalar @$range) - 1;

	# ELSE looks ahead to the next range to avoid adding an
	# unnecessary level of branching.
	$entry = $range->[ $idx + 1 ];

	# ELSE state
	push @result,
	  sprintf("%selse if (cp >= 0x%04X)\n%s{",
		$space, $entry->{Start}, $space);

	if ($idx == $to)
	{
		push @result,
		  sprintf("%s\treturn %s[cp - 0x%04X + %d];",
			$space, $table, $entry->{Start}, $entry->{Offset});
	}
	else
	{
		push @result,
		  @{ branch($range, $idx + 1, $to, $indent + 1, $table) };
	}

	push @result, $space . "}";

	return \@result;
}
595-
596-
# Split a list of numbers into contiguous-enough ranges.  The numbers are
# sorted numerically; whenever the gap between two neighbors is larger
# than $limit, the current range is closed and a new one is started.
#
# For example, for the numbers 0, 1, 2, 4, 5 and limit = 1, the gap between
# 2 and 4 is 2, which exceeds the limit, so the result is the two ranges
# 0-2 and 4-5.
#
# Returns an array ref of hashes with the keys:
#   Start  - first number of the range
#   End    - last number of the range plus one (exclusive bound)
#   Offset - sum of the lengths (End - Start) of all preceding ranges
#
# Dies if fewer than two numbers are given or if the smallest is not 0.
sub make_ranges
{
	my ($nums, $limit) = @_;

	my @ordered = sort { $a <=> $b } @$nums;

	die "expecting at least 2 codepoints" if (scalar @ordered < 2);
	die "expecting first codepoint to start at 0" unless $ordered[0] == 0;

	my @range;
	my $first = $ordered[0];
	my $offset = 0;
	my $last_idx = $#ordered;

	for my $i (0 .. $last_idx)
	{
		my $cp = $ordered[$i];
		my $is_final = ($i == $last_idx);

		# A range is closed either at the final number or when the next
		# number lies more than $limit away.
		next unless $is_final || ($ordered[ $i + 1 ] - $cp > $limit);

		push @range,
		  {
			Start => $first,
			End => $cp + 1,
			Offset => $offset
		  };

		$offset += $cp + 1 - $first;
		$first = $ordered[ $i + 1 ] unless $is_final;
	}

	return \@range;
}
643-
644-
# Flatten all ranges into the initializer lines of the case_map table, one
# line per code point from Start up to (but not including) End.  A code
# point without a case mapping gets the entry zero.
#
# Reads the file-level %simple hash.  Note that looking up
# $simple{$cp}{Index} creates an empty $simple{$cp} entry for code points
# that had none.
#
# Returns the list of generated lines.
sub range_tables
{
	my ($range) = @_;
	my @lines;

	for my $entry (@$range)
	{
		my ($first, $last) = ($entry->{Start}, $entry->{End} - 1);

		for my $cp ($first .. $last)
		{
			my $cell = sprintf("%d,", ($simple{$cp}{Index} || 0));

			# Short cells get one extra tab so the trailing comments
			# stay aligned.
			my $pad = (length($cell) < 4) ? "\t" : "";

			push @lines,
			  sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $cell . $pad, $cp);
		}
	}

	return @lines;
}

0 commit comments

Comments
 (0)