Skip to content

Commit 779dcfd

Browse files
committed
Normalize unicode identifiers
1 parent c784e6d commit 779dcfd

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

graalpython/com.oracle.graal.python.pegparser/src/com/oracle/graal/python/pegparser/AbstractParser.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -52,6 +52,8 @@
5252
import java.util.List;
5353
import java.util.function.Supplier;
5454

55+
import org.graalvm.shadowed.com.ibm.icu.text.Normalizer2;
56+
5557
import com.oracle.graal.python.pegparser.sst.ArgTy;
5658
import com.oracle.graal.python.pegparser.sst.CmpOpTy;
5759
import com.oracle.graal.python.pegparser.sst.ComprehensionTy;
@@ -350,7 +352,7 @@ public Token getLastNonWhitespaceToken() {
350352
public ExprTy.Name name_token() {
351353
Token t = expect(Token.Kind.NAME);
352354
if (t != null) {
353-
return factory.createVariable(getText(t), t.sourceRange);
355+
return name_from_token(t);
354356
} else {
355357
return null;
356358
}
@@ -504,6 +506,13 @@ public ExprTy.Name name_from_token(Token t) {
504506
return null;
505507
}
506508
String id = getText(t);
509+
for (int i = 0; i < id.length(); i++) {
510+
if (id.charAt(i) > 0xff) {
511+
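// If the identifier has a char above U+00FF, NFKC-normalize it per PEP 3131. NOTE(review): this is a Latin-1 check, not an ASCII check; chars in U+0080..U+00FF such as 'ª' or 'µ' also change under NFKC but are skipped here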
512+
id = Normalizer2.getNFKCInstance().normalize(id);
513+
break;
514+
}
515+
}
507516
return factory.createVariable(id, t.sourceRange);
508517
}
509518

graalpython/com.oracle.graal.python.pegparser/src/com/oracle/graal/python/pegparser/tokenizer/Tokenizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2021, 2024, Oracle and/or its affiliates.
1+
/* Copyright (c) 2021, 2025, Oracle and/or its affiliates.
22
* Copyright (C) 1996-2021 Python Software Foundation
33
*
44
* Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2

graalpython/com.oracle.graal.python.test/src/tests/test_ast.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, 2022, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
22
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
33
#
44
# The Universal Permissive License (UPL), Version 1.0
@@ -181,6 +181,9 @@ def test_unparse_bytes_constant_kind(self):
181181
exec(compile(tree, '<string>', 'exec'), vars)
182182
self.assertEqual("u'abc'", vars['f'].__annotations__['x'])
183183

184+
def test_parse_unicode(self):
185+
self.assertEqual(ast.parse("𝕦𝕟𝕚𝕔𝕠𝕕𝕖").body[0].value.id, 'unicode')
186+
184187

185188
if __name__ == '__main__':
186189
unittest.main()

0 commit comments

Comments
 (0)