|
2 | 2 | # |
3 | 3 | # Copyright (c) 2007-2025, PostgreSQL Global Development Group |
4 | 4 | # |
5 | | -# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl |
| 5 | +# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl |
6 | 6 | # |
7 | | -# Generate UTF-8 <--> GB18030 code conversion tables from |
8 | | -# "gb-18030-2000.xml", obtained from |
9 | | -# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ |
| 7 | +# Generate UTF-8 <--> EUC_CN code conversion tables from |
| 8 | +# "gb18030-2022.ucm", obtained from |
| 9 | +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/ |
10 | 10 | # |
11 | 11 | # The lines we care about in the source file look like |
12 | | -# <a u="009A" b="81 30 83 36"/> |
13 | | -# where the "u" field is the Unicode code point in hex, |
14 | | -# and the "b" field is the hex byte sequence for GB18030 |
| 12 | +# <UXXXX> \xYY[\xYY...] |n |
| 13 | +# where XXXX is the Unicode code point in hex, |
| 14 | +# and the \xYY... is the hex byte sequence for GB18030, |
| 15 | +# and n is a flag indicating the type of mapping. |
15 | 16 |
|
16 | 17 | use strict; |
17 | 18 | use warnings FATAL => 'all'; |
|
22 | 23 |
|
23 | 24 | # Read the input |
24 | 25 |
|
25 | | -my $in_file = "gb-18030-2000.xml"; |
| 26 | +my $in_file = "gb18030-2022.ucm"; |
26 | 27 |
|
27 | 28 | open(my $in, '<', $in_file) || die("cannot open $in_file"); |
28 | 29 |
|
29 | 30 | my @mapping; |
30 | 31 |
|
31 | 32 | while (<$in>) |
32 | 33 | { |
33 | | - next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); |
34 | | - my ($u, $c) = ($1, $2); |
35 | | - $c =~ s/ //g; |
| 34 | + # Mappings may have been removed by commenting out |
| 35 | + next if /^#/; |
| 36 | + |
| 37 | + next if !/^<U([0-9A-Fa-f]+)>\s+ |
| 38 | + ((?:\\x[0-9A-Fa-f]{2})+)\s+ |
| 39 | + \|(\d+)/x; |
| 40 | + my ($u, $c, $flag) = ($1, $2, $3); |
| 41 | + $c =~ s/\\x//g; |
| 42 | + |
| 43 | + # We only want round-trip mappings |
| 44 | + next if ($flag ne '0'); |
| 45 | + |
36 | 46 | my $ucs = hex($u); |
37 | 47 | my $code = hex($c); |
38 | 48 |
|
|
0 commit comments