Skip to content

Commit a8a9e93

Browse files
committed
Revert/fix mb_substitute_character() codepoint checks
The introduced checks did not treat "non-Unicode" encodings correctly, because they treated the passed integer as encoded in the internal encoding in that case, while in actuality the substitute character is always a Unicode codepoint. Additionally checking the codepoint against the internal encoding is not correct in any case, because the substitution character must be mapped in the *target* encoding of the conversion, which does not necessarily coincide with the internal encoding (the internal encoding is the default *source* encoding, not *target* encoding). This reverts the checks back to simple range checks, but in a way that still resolves #69079: Characters outside the Basic Multilingual Plane are now accepted and Surrogate Codepoints are rejected. A distinction between UTF-8 and non-UTF-8 encodings is not made for surrogate checks (as in the original patch), as surrogates are always illegal on their own. Specifying a surrogate as substitution character would only make sense if you could specify a substitution string with more than one character -- however we do not support that.
1 parent 3557436 commit a8a9e93

File tree

6 files changed

+61
-92
lines changed

6 files changed

+61
-92
lines changed

ext/mbstring/mbstring.c

Lines changed: 12 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,69 +1978,21 @@ PHP_FUNCTION(mb_detect_order)
19781978

19791979
static inline int php_mb_check_code_point(long cp)
19801980
{
1981-
enum mbfl_no_encoding no_enc;
1982-
char* buf;
1983-
char buf_len;
1984-
1985-
no_enc = MBSTRG(current_internal_encoding)->no_encoding;
1986-
1987-
if (php_mb_is_no_encoding_utf8(no_enc)) {
1988-
1989-
if ((cp > 0 && 0xd800 > cp) || (cp > 0xdfff && 0x110000 > cp)) {
1990-
return 1;
1991-
}
1992-
1981+
if (cp <= 0 || cp >= 0x110000) {
1982+
/* Out of Unicode range */
19931983
return 0;
1994-
} else if (php_mb_is_no_encoding_unicode(no_enc)) {
1995-
1996-
if (0 > cp || cp > 0x10ffff) {
1997-
return 0;
1998-
}
1999-
2000-
return 1;
2001-
2002-
// backward compatibility
2003-
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
2004-
return cp < 0xffff && cp > 0x0;
20051984
}
20061985

2007-
if (cp < 0x100) {
2008-
buf_len = 1;
2009-
buf = (char *) safe_emalloc(buf_len, 1, 1);
2010-
buf[0] = cp;
2011-
buf[1] = 0;
2012-
} else if (cp < 0x10000) {
2013-
buf_len = 2;
2014-
buf = (char *) safe_emalloc(buf_len, 1, 1);
2015-
buf[0] = cp >> 8;
2016-
buf[1] = cp & 0xff;
2017-
buf[2] = 0;
2018-
} else if (cp < 0x1000000) {
2019-
buf_len = 3;
2020-
buf = (char *) safe_emalloc(buf_len, 1, 1);
2021-
buf[0] = cp >> 16;
2022-
buf[1] = (cp >> 8) & 0xff;
2023-
buf[2] = cp & 0xff;
2024-
buf[3] = 0;
2025-
} else {
2026-
buf_len = 4;
2027-
buf = (char *) safe_emalloc(buf_len, 1, 1);
2028-
buf[0] = cp >> 24;
2029-
buf[1] = (cp >> 16) & 0xff;
2030-
buf[2] = (cp >> 8) & 0xff;
2031-
buf[3] = cp & 0xff;
2032-
buf[4] = 0;
2033-
}
2034-
2035-
if (php_mb_check_encoding(buf, buf_len, NULL)) {
2036-
efree(buf);
2037-
2038-
return 1;
1986+
if (cp >= 0xd800 && cp <= 0xdfff) {
1987+
/* Surrogate code-point. These are never valid on their own and we only allow a single
1988+
* substitute character. */
1989+
return 0;
20391990
}
20401991

2041-
efree(buf);
2042-
2043-
return 0;
1992+
/* As the we do not know the target encoding of the conversion operation that is going to
1993+
* use the substitution character, we cannot check whether the codepoint is actually mapped
1994+
* in the given encoding at this point. Thus we have to accept everything. */
1995+
return 1;
20441996
}
20451997

20461998
/* {{{ proto mixed mb_substitute_character([mixed substchar])
@@ -2081,7 +2033,7 @@ PHP_FUNCTION(mb_substitute_character)
20812033
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
20822034
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
20832035
} else {
2084-
php_error_docref(NULL, E_WARNING, "Unknown character.");
2036+
php_error_docref(NULL, E_WARNING, "Unknown character");
20852037
RETURN_FALSE;
20862038
}
20872039
}
@@ -2092,7 +2044,7 @@ PHP_FUNCTION(mb_substitute_character)
20922044
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
20932045
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
20942046
} else {
2095-
php_error_docref(NULL, E_WARNING, "Unknown character.");
2047+
php_error_docref(NULL, E_WARNING, "Unknown character");
20962048
RETURN_FALSE;
20972049
}
20982050
break;

ext/mbstring/tests/bug69079.phpt

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,32 @@ Bug #69079 (enhancement for mb_substitute_character)
44
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
55
--FILE--
66
<?php
7+
78
mb_internal_encoding('UTF-8');
8-
var_dump(mb_substitute_character(0x1f600));
9+
var_dump(mb_substitute_character(0x1F600));
10+
var_dump(bin2hex(mb_scrub("\xff")));
11+
mb_substitute_character(0x3f); // Reset to '?', as the next call will fail
12+
var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
13+
var_dump(bin2hex(mb_scrub("\xff")));
14+
915
mb_internal_encoding('EUC-JP-2004');
10-
var_dump(mb_substitute_character(0x8fa1ef));
16+
17+
mb_substitute_character(0x63); // Reset to '?', as the next call will fail
18+
mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
19+
var_dump(bin2hex(mb_scrub("\x8d")));
20+
21+
mb_substitute_character(0x50aa);
22+
var_dump(bin2hex(mb_scrub("\x8d")));
23+
1124
?>
12-
--EXPECT--
25+
--EXPECTF--
1326
bool(true)
14-
bool(true)
27+
string(8) "f09f9880"
28+
29+
Warning: mb_substitute_character(): Unknown character in %s on line %d
30+
bool(false)
31+
string(2) "3f"
32+
33+
Warning: mb_substitute_character(): Unknown character in %s on line %d
34+
string(2) "63"
35+
string(6) "8fa1ef"

ext/mbstring/tests/bug69086.phpt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@ mb_substitute_character(0xfffd);
88
var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
99
mb_internal_encoding("UCS-4BE");
1010
var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
11-
mb_substitute_character(0xd800);
12-
var_dump("\x00\x00\x00\x3f" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
1311
?>
1412
--EXPECT--
1513
bool(true)
1614
bool(true)
17-
bool(true)

ext/mbstring/tests/mb_chr.phpt

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@ mb_substitute_character(0xfffd);
1515
var_dump(
1616
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
1717
);
18-
mb_substitute_character(0xd800);
1918
var_dump(
20-
"?" === mb_chr(0xd800, "UTF-8")
19+
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
2120
);
2221

2322
mb_internal_encoding("EUC-JP");
@@ -43,15 +42,15 @@ bool(true)
4342
bool(true)
4443
bool(true)
4544

46-
Warning: mb_chr(): Unknown encoding "typo" in %s on line 26
45+
Warning: mb_chr(): Unknown encoding "typo" in %s on line %d
4746

48-
Warning: mb_chr(): Unsupported encoding "pass" in %s on line 27
47+
Warning: mb_chr(): Unsupported encoding "pass" in %s on line %d
4948

50-
Warning: mb_chr(): Unsupported encoding "jis" in %s on line 28
49+
Warning: mb_chr(): Unsupported encoding "jis" in %s on line %d
5150

52-
Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line 29
51+
Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line %d
5352

54-
Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line 30
53+
Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line %d
5554
bool(false)
5655
bool(false)
5756
bool(false)

ext/mbstring/tests/mb_substitute_character_basic.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,6 @@ int(1234)
3838
bool(true)
3939
string(4) "none"
4040

41-
Warning: mb_substitute_character(): Unknown character. in %s on line %d
41+
Warning: mb_substitute_character(): Unknown character in %s on line %d
4242
bool(false)
4343
===DONE===

ext/mbstring/tests/mb_substitute_character_variation1.phpt

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ fclose($fp);
123123
*** Testing mb_substitute_character() : usage variation ***
124124

125125
--int 0--
126-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
126+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
127127
bool(false)
128128

129129
--int 1--
@@ -133,30 +133,30 @@ bool(true)
133133
bool(true)
134134

135135
--int -12345--
136-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
136+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
137137
bool(false)
138138

139139
--float 10.5--
140140
bool(true)
141141

142142
--float -10.5--
143-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
143+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
144144
bool(false)
145145

146146
--float 12.3456789000e10--
147-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
147+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
148148
bool(false)
149149

150150
--float -12.3456789000e10--
151-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
151+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
152152
bool(false)
153153

154154
--float .5--
155-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
155+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
156156
bool(false)
157157

158158
--empty array--
159-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
159+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
160160
bool(false)
161161

162162
--int indexed array--
@@ -169,25 +169,25 @@ bool(true)
169169
bool(true)
170170

171171
--uppercase NULL--
172-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
172+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
173173
bool(false)
174174

175175
--lowercase null--
176-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
176+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
177177
bool(false)
178178

179179
--lowercase true--
180180
bool(true)
181181

182182
--lowercase false--
183-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
183+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
184184
bool(false)
185185

186186
--uppercase TRUE--
187187
bool(true)
188188

189189
--uppercase FALSE--
190-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
190+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
191191
bool(false)
192192

193193
--empty string DQ--
@@ -197,19 +197,19 @@ bool(true)
197197
bool(true)
198198

199199
--string DQ--
200-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
200+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
201201
bool(false)
202202

203203
--string SQ--
204-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
204+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
205205
bool(false)
206206

207207
--mixed case string--
208-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
208+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
209209
bool(false)
210210

211211
--heredoc--
212-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
212+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
213213
bool(false)
214214

215215
--instance of classWithToString--
@@ -221,11 +221,11 @@ Error: 8 - Object of class classWithoutToString could not be converted to int, %
221221
bool(true)
222222

223223
--undefined var--
224-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
224+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
225225
bool(false)
226226

227227
--unset var--
228-
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
228+
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
229229
bool(false)
230230
===DONE===
231231

0 commit comments

Comments
 (0)