Revert/fix mb_substitute_character() codepoint checks

nikic · nikic · commit a8a9e93e9a90 · 2017-08-03T21:12:41.000+02:00
The introduced checks did not treat "non-Unicode" encodings correctly,
because they treated the passed integer as encoded in the internal
encoding in that case, while in actuality the substitute character
is always a Unicode codepoint.

Additionally checking the codepoint against the internal encoding
is not correct in any case, because the substitution character must
be mapped in the *target* encoding of the conversion, which does
not necessarily coincide with the internal encoding (the internal
encoding is the default *source* encoding, not *target* encoding).

This reverts the checks back to simple range checks, but in a way
that still resolves #69079: Characters outside the Basic
Multilingual Plane are now accepted and Surrogate Codepoints are
rejected. A distinction between UTF-8 and non-UTF-8 encodings is
not made for surrogate checks (as in the original patch), as
surrogates are always illegal on their own. Specifying a surrogate
as substitution character would only make sense if you could
specify a substitution string with more than one character --
however we do not support that.
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -1978,69 +1978,21 @@ PHP_FUNCTION(mb_detect_order)
 
 static inline int php_mb_check_code_point(long cp)
 {
-	enum mbfl_no_encoding no_enc;
-	char* buf;
-	char buf_len;
-
-	no_enc = MBSTRG(current_internal_encoding)->no_encoding;
-
-	if (php_mb_is_no_encoding_utf8(no_enc)) {
-
-		if ((cp > 0 && 0xd800 > cp) || (cp > 0xdfff && 0x110000 > cp)) {
-			return 1;
-		}
-
+	if (cp <= 0 || cp >= 0x110000) {
+		/* Out of Unicode range */
 		return 0;
-	} else if (php_mb_is_no_encoding_unicode(no_enc)) {
-
-		if (0 > cp || cp > 0x10ffff) {
-			return 0;
-		}
-
-		return 1;
-
-	// backward compatibility
-	} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
-		return cp < 0xffff && cp > 0x0;
 	}
 
-	if (cp < 0x100) {
-		buf_len = 1;
-		buf = (char *) safe_emalloc(buf_len, 1, 1);
-		buf[0] = cp;
-		buf[1] = 0;
-	} else if (cp < 0x10000) {
-		buf_len = 2;
-		buf = (char *) safe_emalloc(buf_len, 1, 1);
-		buf[0] = cp >> 8;
-		buf[1] = cp & 0xff;
-		buf[2] = 0;
-	} else if (cp < 0x1000000) {
-		buf_len = 3;
-		buf = (char *) safe_emalloc(buf_len, 1, 1);
-		buf[0] = cp >> 16;
-		buf[1] = (cp >> 8) & 0xff;
-		buf[2] = cp & 0xff;
-		buf[3] = 0;
-	} else {
-		buf_len = 4;
-		buf = (char *) safe_emalloc(buf_len, 1, 1);
-		buf[0] = cp >> 24;
-		buf[1] = (cp >> 16) & 0xff;
-		buf[2] = (cp >> 8) & 0xff;
-		buf[3] = cp & 0xff;
-		buf[4] = 0;
-	}
-
-	if (php_mb_check_encoding(buf, buf_len, NULL)) {
-		efree(buf);
-
-		return 1;
+	if (cp >= 0xd800 && cp <= 0xdfff) {
+		/* Surrogate code-point. These are never valid on their own and we only allow a single
+		 * substitute character. */
+		return 0;
 	}
 
-	efree(buf);
-
-	return 0;
+	/* As the we do not know the target encoding of the conversion operation that is going to
+	 * use the substitution character, we cannot check whether the codepoint is actually mapped
+	 * in the given encoding at this point. Thus we have to accept everything. */
+	return 1;
 }
 
 /* {{{ proto mixed mb_substitute_character([mixed substchar])
@@ -2081,7 +2033,7 @@ PHP_FUNCTION(mb_substitute_character)
 						MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
 						MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
 					} else {
-						php_error_docref(NULL, E_WARNING, "Unknown character.");
+						php_error_docref(NULL, E_WARNING, "Unknown character");
 						RETURN_FALSE;
 					}
 				}
@@ -2092,7 +2044,7 @@ PHP_FUNCTION(mb_substitute_character)
 					MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
 					MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
 				} else {
-					php_error_docref(NULL, E_WARNING, "Unknown character.");
+					php_error_docref(NULL, E_WARNING, "Unknown character");
 					RETURN_FALSE;
 				}
 				break;
diff --git a/ext/mbstring/tests/bug69079.phpt b/ext/mbstring/tests/bug69079.phpt
@@ -4,11 +4,32 @@ Bug #69079 (enhancement for mb_substitute_character)
 <?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
 --FILE--
 <?php
+
 mb_internal_encoding('UTF-8');
-var_dump(mb_substitute_character(0x1f600));
+var_dump(mb_substitute_character(0x1F600));
+var_dump(bin2hex(mb_scrub("\xff"))); 
+mb_substitute_character(0x3f); // Reset to '?', as the next call will fail
+var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
+var_dump(bin2hex(mb_scrub("\xff")));
+
 mb_internal_encoding('EUC-JP-2004');
-var_dump(mb_substitute_character(0x8fa1ef));
+
+mb_substitute_character(0x63); // Reset to '?', as the next call will fail
+mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
+var_dump(bin2hex(mb_scrub("\x8d")));
+
+mb_substitute_character(0x50aa);
+var_dump(bin2hex(mb_scrub("\x8d")));
+
 ?>
---EXPECT--
+--EXPECTF--
 bool(true)
-bool(true)
+string(8) "f09f9880"
+
+Warning: mb_substitute_character(): Unknown character in %s on line %d
+bool(false)
+string(2) "3f"
+
+Warning: mb_substitute_character(): Unknown character in %s on line %d
+string(2) "63"
+string(6) "8fa1ef"
diff --git a/ext/mbstring/tests/bug69086.phpt b/ext/mbstring/tests/bug69086.phpt
@@ -8,10 +8,7 @@ mb_substitute_character(0xfffd);
 var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
 mb_internal_encoding("UCS-4BE");
 var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
-mb_substitute_character(0xd800);
-var_dump("\x00\x00\x00\x3f" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
 ?>
 --EXPECT--
 bool(true)
 bool(true)
-bool(true)
diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt
@@ -15,9 +15,8 @@ mb_substitute_character(0xfffd);
 var_dump(
     "\u{fffd}" === mb_chr(0xd800, "UTF-8")
 );
-mb_substitute_character(0xd800);
 var_dump(
-    "?" === mb_chr(0xd800, "UTF-8")
+    "\u{fffd}" === mb_chr(0xd800, "UTF-8")
 );
 
 mb_internal_encoding("EUC-JP");
@@ -43,15 +42,15 @@ bool(true)
 bool(true)
 bool(true)
 
-Warning: mb_chr(): Unknown encoding "typo" in %s on line 26
+Warning: mb_chr(): Unknown encoding "typo" in %s on line %d
 
-Warning: mb_chr(): Unsupported encoding "pass" in %s on line 27
+Warning: mb_chr(): Unsupported encoding "pass" in %s on line %d
 
-Warning: mb_chr(): Unsupported encoding "jis" in %s on line 28
+Warning: mb_chr(): Unsupported encoding "jis" in %s on line %d
 
-Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line 29
+Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line %d
 
-Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line 30
+Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line %d
 bool(false)
 bool(false)
 bool(false)
diff --git a/ext/mbstring/tests/mb_substitute_character_basic.phpt b/ext/mbstring/tests/mb_substitute_character_basic.phpt
@@ -38,6 +38,6 @@ int(1234)
 bool(true)
 string(4) "none"
 
-Warning: mb_substitute_character(): Unknown character. in %s on line %d
+Warning: mb_substitute_character(): Unknown character in %s on line %d
 bool(false)
 ===DONE===
diff --git a/ext/mbstring/tests/mb_substitute_character_variation1.phpt b/ext/mbstring/tests/mb_substitute_character_variation1.phpt
@@ -123,7 +123,7 @@ fclose($fp);
 *** Testing mb_substitute_character() : usage variation ***
 
 --int 0--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --int 1--
@@ -133,30 +133,30 @@ bool(true)
 bool(true)
 
 --int -12345--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --float 10.5--
 bool(true)
 
 --float -10.5--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --float 12.3456789000e10--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --float -12.3456789000e10--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --float .5--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --empty array--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --int indexed array--
@@ -169,25 +169,25 @@ bool(true)
 bool(true)
 
 --uppercase NULL--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --lowercase null--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --lowercase true--
 bool(true)
 
 --lowercase false--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --uppercase TRUE--
 bool(true)
 
 --uppercase FALSE--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --empty string DQ--
@@ -197,19 +197,19 @@ bool(true)
 bool(true)
 
 --string DQ--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --string SQ--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --mixed case string--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --heredoc--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --instance of classWithToString--
@@ -221,11 +221,11 @@ Error: 8 - Object of class classWithoutToString could not be converted to int, %
 bool(true)
 
 --undefined var--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 
 --unset var--
-Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
+Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
 bool(false)
 ===DONE===