Merge branch 'PHP-8.2'

alexdowad · alexdowad · commit 3e743e9ba106 · 2022-11-21T14:49:55.000+02:00
* PHP-8.2:
  For UTF-7, flag unnecessary extra trailing byte in Base64 section as error
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c
@@ -530,9 +530,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 			}
 
 			unsigned char n4 = decode_base64(*p++);
-			if (is_base64_end(n4) || p == e) {
+			if (is_base64_end(n4)) {
 				out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n5 = decode_base64(*p++);
 			if (is_base64_end(n5) || p == e) {
@@ -552,9 +555,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 			}
 
 			unsigned char n7 = decode_base64(*p++);
-			if (is_base64_end(n7) || p == e) {
+			if (is_base64_end(n7)) {
 				out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n8 = decode_base64(*p++);
 			if (is_base64_end(n8)) {
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c
@@ -558,9 +558,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
 			}
 
 			unsigned char n4 = decode_base64(*p++);
-			if (is_base64_end(n4) || p == e) {
+			if (is_base64_end(n4)) {
 				out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n4, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n5 = decode_base64(*p++);
 			if (is_base64_end(n5) || p == e) {
@@ -580,9 +583,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
 			}
 
 			unsigned char n7 = decode_base64(*p++);
-			if (is_base64_end(n7) || p == e) {
+			if (is_base64_end(n7)) {
 				out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n7, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n8 = decode_base64(*p++);
 			if (is_base64_end(n8)) {
diff --git a/ext/mbstring/tests/utf7imap_encoding.phpt b/ext/mbstring/tests/utf7imap_encoding.phpt
@@ -221,6 +221,14 @@ convertInvalidString("\x80", "%", "UTF7-IMAP", "UTF-8");
 convertInvalidString("abc&", "abc%", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it
 convertInvalidString("&**-", "%*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
 
+// Try strings where Base64 has an extra trailing byte which is not needed
+convertInvalidString('&RR8I', "\xE4\x94\x9F%", 'UTF7-IMAP', 'UTF-8');
+convertInvalidString('&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF7-IMAP', 'UTF-8');
+
+// It is useless for a Base64 section to only contain a single 'A'
+// (which decodes to only zero bits)
+convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');
+
 echo "Done!\n";
 ?>
 --EXPECT--
diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt
@@ -1063,6 +1063,16 @@ testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32
 testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
 testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
 
+// A Base64 section should not consist of exactly 4 Base64 characters; the first 3
+// already encode one UTF-16 code unit, so there is no need for the 4th
+testInvalidString('+RR8I', "\xE4\x94\x9F%", 'UTF-7', 'UTF-8');
+// Likewise with 7 characters
+testInvalidString('+RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF-7', 'UTF-8');
+
+// Similarly, it is useless for a Base64 section to only contain a single 'A'
+// (which decodes to only zero bits)
+testInvalidString("+A", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
+
 // And then, messed up Base64 encoding
 
 // Bad padding on + section (not zeroes)