Skip to content

Commit 334ac94

Browse files
committed
[GR-23268] Fixes for test_unicode, part 1
PullRequest: graalpython/1248
2 parents 413da8f + ba464ad commit 334ac94

File tree

12 files changed

+313
-251
lines changed

12 files changed

+313
-251
lines changed

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_unicode.txt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_comparison
1919
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_concatenation
2020
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_constructor
21+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_constructor_defaults
2122
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_constructor_keyword_args
2223
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_contains
2324
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_count
@@ -36,9 +37,45 @@
3637
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_formatting_huge_precision_c_limits
3738
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_formatting_with_enum
3839
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_hash
40+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isalpha
3941
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isascii
42+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isdecimal
4043
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_islower
44+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isprintable
4145
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isspace
4246
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_issue28598_strsubclass_rhs
47+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_istitle
4348
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_isupper
4449
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_iterators
50+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_join_overflow
51+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_literals
52+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_ljust
53+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_lower
54+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_mul
55+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_none_arguments
56+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_partition
57+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_printable_repr
58+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_printing
59+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_replace_id
60+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_replace_overflow
61+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_resize
62+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_rfind
63+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_rindex
64+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_rjust
65+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_rpartition
66+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_rsplit
67+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_slice
68+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_splitlines
69+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_startswith
70+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_startswith_endswith_errors
71+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_strip
72+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_strip_whitespace
73+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_subclass_add
74+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_subscript
75+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_surrogates
76+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_title
77+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_unicode_repr
78+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_upper
79+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_utf8_decode_invalid_sequences
80+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_utf8_decode_valid_sequences
81+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_zfill

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinConstructors.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1797,9 +1797,9 @@ public final Object executeWith(VirtualFrame frame, Object arg) {
17971797

17981798
public abstract Object executeWith(VirtualFrame frame, Object strClass, Object arg, Object encoding, Object errors);
17991799

1800-
@Specialization(guards = {"!isNativeClass(strClass)", "isNoValue(arg)", "isNoValue(encoding)", "isNoValue(errors)"})
1800+
@Specialization(guards = {"!isNativeClass(strClass)", "isNoValue(arg)"})
18011801
@SuppressWarnings("unused")
1802-
Object strNoArgs(Object strClass, PNone arg, PNone encoding, PNone errors) {
1802+
Object strNoArgs(Object strClass, PNone arg, Object encoding, Object errors) {
18031803
return asPString(strClass, "");
18041804
}
18051805

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
import static com.oracle.graal.python.runtime.exception.PythonErrorType.UnicodeDecodeError;
4646
import static com.oracle.graal.python.runtime.exception.PythonErrorType.UnicodeEncodeError;
4747

48-
import java.nio.BufferUnderflowException;
4948
import java.nio.ByteBuffer;
5049
import java.nio.CharBuffer;
5150
import java.nio.charset.CharacterCodingException;
@@ -258,6 +257,7 @@ private int getLength(PBytes b) {
258257
}
259258
}
260259

260+
// Encoder for raw_unicode_escape
261261
@Builtin(name = "__truffle_raw_encode", minNumOfPositionalArgs = 1, parameterNames = {"str", "errors"})
262262
@GenerateNodeFactory
263263
public abstract static class RawEncodeNode extends EncodeBaseNode {
@@ -425,14 +425,15 @@ private boolean castToBoolean(VirtualFrame frame, Object object) {
425425
}
426426
}
427427

428+
// Decoder for raw_unicode_escape
428429
@Builtin(name = "__truffle_raw_decode", minNumOfPositionalArgs = 1, parameterNames = {"bytes", "errors"})
429430
@GenerateNodeFactory
430431
abstract static class RawDecodeNode extends EncodeBaseNode {
431432
@Child private GetInternalByteArrayNode toByteArrayNode;
432433

433434
@Specialization
434435
Object decode(PBytesLike bytes, @SuppressWarnings("unused") PNone errors) {
435-
String string = decodeBytes(getBytesBuffer(bytes), "strict");
436+
String string = decodeBytes(getBytes(bytes), "strict");
436437
return factory().createTuple(new Object[]{string, string.length()});
437438
}
438439

@@ -446,60 +447,64 @@ Object decode(PBytesLike bytes, Object errors,
446447
CompilerDirectives.transferToInterpreterAndInvalidate();
447448
throw new IllegalStateException("should not be reached");
448449
}
449-
String string = decodeBytes(getBytesBuffer(bytes), profiledErrors);
450+
String string = decodeBytes(getBytes(bytes), profiledErrors);
450451
return factory().createTuple(new Object[]{string, string.length()});
451452
}
452453

453-
private ByteBuffer getBytesBuffer(PBytesLike bytesLike) {
454+
private byte[] getBytes(PBytesLike bytesLike) {
454455
if (toByteArrayNode == null) {
455456
CompilerDirectives.transferToInterpreterAndInvalidate();
456457
toByteArrayNode = insert(GetInternalByteArrayNodeGen.create());
457458
}
458-
byte[] barr = toByteArrayNode.execute(bytesLike.getSequenceStorage());
459-
return ByteBuffer.wrap(barr, 0, barr.length);
459+
return toByteArrayNode.execute(bytesLike.getSequenceStorage());
460460
}
461461

462462
@TruffleBoundary
463-
String decodeBytes(ByteBuffer bytes, String errors) {
463+
String decodeBytes(byte[] bytes, String errors) {
464464
CodingErrorAction errorAction = convertCodingErrorAction(errors);
465465
try {
466-
ByteBuffer buf = ByteBuffer.allocate(bytes.remaining() * Integer.BYTES);
467-
byte[] hexString = new byte[8];
468-
while (bytes.hasRemaining()) {
469-
int val;
470-
byte b = bytes.get();
471-
if (b == (byte) '\\') {
472-
byte b1 = bytes.get();
466+
ByteBuffer buf = ByteBuffer.allocate(bytes.length * Integer.BYTES);
467+
int i = 0;
468+
while (i < bytes.length) {
469+
byte b = bytes[i];
470+
if (b == (byte) '\\' && i + 1 < bytes.length) {
471+
byte b1 = bytes[i + 1];
472+
int numIndex = i + 2;
473473
if (b1 == (byte) 'u') {
474-
bytes.get(hexString, 0, 4);
475-
val = Integer.parseInt(new String(hexString, 0, 4), 16);
474+
final int count = 4;
475+
if (numIndex + count > bytes.length) {
476+
throw raise(UnicodeDecodeError);
477+
}
478+
buf.putInt(Integer.parseInt(new String(bytes, numIndex, count), 16));
479+
i = numIndex + count;
480+
continue;
476481
} else if (b1 == (byte) 'U') {
477-
bytes.get(hexString, 0, 8);
478-
val = Integer.parseInt(new String(hexString, 0, 8), 16);
479-
} else {
480-
throw new CharacterCodingException();
482+
final int count = 8;
483+
if (numIndex + count > bytes.length) {
484+
throw raise(UnicodeDecodeError);
485+
}
486+
buf.putInt(Integer.parseInt(new String(bytes, numIndex, count), 16));
487+
i = numIndex + count;
488+
continue;
481489
}
482-
} else {
483-
// Bytes that are not an escape sequence are latin-1, which maps to unicode
484-
// codepoints directly
485-
val = b & 0xFF;
486490
}
487-
buf.putInt(val);
491+
// Bytes that are not an escape sequence are latin-1, which maps to unicode
492+
// codepoints directly
493+
buf.putInt(b & 0xFF);
494+
i++;
488495
}
489496
buf.flip();
490497
CharBuffer decoded = UTF32.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
491498
return String.valueOf(decoded);
492-
} catch (CharacterCodingException | NumberFormatException | BufferUnderflowException e) {
499+
} catch (CharacterCodingException | NumberFormatException e) {
493500
throw raise(UnicodeDecodeError, e);
494501
}
495502
}
496503
}
497504

498-
// _codecs.lookup(name)
499505
@Builtin(name = "__truffle_lookup", minNumOfPositionalArgs = 1)
500506
@GenerateNodeFactory
501507
abstract static class CodecsLookupNode extends PythonBuiltinNode {
502-
// This is replaced in the core _codecs.py with the full functionality
503508
@Specialization
504509
Object lookup(String encoding) {
505510
if (CharsetMapping.getCharset(encoding) != null) {
@@ -510,7 +515,6 @@ Object lookup(String encoding) {
510515
}
511516
}
512517

513-
// _codecs.lookup(name)
514518
@Builtin(name = "charmap_build", minNumOfPositionalArgs = 1)
515519
@GenerateNodeFactory
516520
abstract static class CharmapBuildNode extends PythonBuiltinNode {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,10 @@ public static byte[] unicodeEscape(String str) {
246246
// ('\U00xxxxxx')
247247
byte[] bytes = new byte[str.length() * 10];
248248
int j = 0;
249-
for (int i = 0; i < str.length(); i++) {
249+
for (int i = 0; i < str.length();) {
250250
int ch = str.codePointAt(i);
251251
j = unicodeEscape(ch, j, bytes);
252+
i += Character.charCount(ch);
252253
}
253254
bytes = Arrays.copyOf(bytes, j);
254255
return bytes;
@@ -258,9 +259,10 @@ public static byte[] unicodeEscape(String str) {
258259
public static byte[] unicodeNonAsciiEscape(String str) {
259260
byte[] bytes = new byte[str.length() * 10];
260261
int j = 0;
261-
for (int i = 0; i < str.length(); i++) {
262+
for (int i = 0; i < str.length();) {
262263
int ch = str.codePointAt(i);
263264
j = unicodeNonAsciiEscape(ch, j, bytes);
265+
i += Character.charCount(ch);
264266
}
265267
bytes = Arrays.copyOf(bytes, j);
266268
return bytes;

0 commit comments

Comments
 (0)