Skip to content

Commit 692d36d

Browse files
jimmo authored and dpgeorge committed
py: Implement partial PEP-498 (f-string) support.
This implements (most of) the PEP-498 spec for f-strings and is based on micropython#4998 by @klardotsh.

It is implemented in the lexer as a syntax translation to `str.format`:

    f"{a}" --> "{}".format(a)

It also supports:

    f"{a=}" --> "a={}".format(a)

This is done by extracting the arguments into a temporary vstr buffer, then after the string has been tokenized, the lexer input queue is saved and the contents of the temporary vstr buffer are injected into the lexer instead.

There are four main limitations:

- raw f-strings (`fr` or `rf` prefixes) are not supported and will raise `SyntaxError: raw f-strings are not supported`.

- literal concatenation of f-strings with adjacent strings will fail:

      "{}" f"{a}" --> "{}{}".format(a)
          (str.format will incorrectly use the braces from the non-f-string)
      f"{a}" f"{a}" --> "{}".format(a) "{}".format(a)
          (cannot concatenate)

- PEP-498 requires the full parser to understand the interpolated argument, however because this entirely runs in the lexer it cannot resolve nested braces in expressions like:

      f"{'}'}"

- The !r, !s, and !a conversions are not supported.

Includes tests and cpydiffs.

Signed-off-by: Jim Mussared <[email protected]>
1 parent 162bf3c commit 692d36d

18 files changed

+292
-8
lines changed

mpy-cross/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@
7676
#define MICROPY_CPYTHON_COMPAT (1)
7777
#define MICROPY_USE_INTERNAL_PRINTF (0)
7878

79+
#define MICROPY_PY_FSTRINGS (1)
7980
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
8081

8182
#if !(defined(MICROPY_GCREGS_SETJMP) || defined(__x86_64__) || defined(__i386__) || defined(__thumb2__) || defined(__thumb__) || defined(__arm__))

ports/unix/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@
8888
#define MICROPY_PY_FUNCTION_ATTRS (1)
8989
#define MICROPY_PY_DESCRIPTORS (1)
9090
#define MICROPY_PY_DELATTR_SETATTR (1)
91+
#define MICROPY_PY_FSTRINGS (1)
9192
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
9293
#define MICROPY_PY_BUILTINS_STR_CENTER (1)
9394
#define MICROPY_PY_BUILTINS_STR_PARTITION (1)

ports/windows/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@
6666
#define MICROPY_PY_FUNCTION_ATTRS (1)
6767
#define MICROPY_PY_DESCRIPTORS (1)
6868
#define MICROPY_PY_DELATTR_SETATTR (1)
69+
#define MICROPY_PY_FSTRINGS (1)
6970
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
7071
#define MICROPY_PY_BUILTINS_STR_CENTER (1)
7172
#define MICROPY_PY_BUILTINS_STR_PARTITION (1)

py/lexer.c

Lines changed: 125 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
6262
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
6363
}
6464

65+
#if MICROPY_PY_FSTRINGS
66+
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
67+
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
68+
}
69+
#endif
70+
6571
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
6672
return lex->chr1 == c;
6773
}
@@ -105,7 +111,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
105111

106112
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107113
return is_char_or(lex, '\'', '\"')
114+
#if MICROPY_PY_FSTRINGS
115+
|| (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
116+
|| (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
117+
&& is_char_following_following_or(lex, '\'', '\"')))
118+
#else
108119
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
120+
#endif
109121
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
110122
&& is_char_following_following_or(lex, '\'', '\"'));
111123
}
@@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) {
132144
++lex->column;
133145
}
134146

147+
// shift the input queue forward
135148
lex->chr0 = lex->chr1;
136149
lex->chr1 = lex->chr2;
137-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
150+
151+
// and add the next byte from either the fstring args or the reader
152+
#if MICROPY_PY_FSTRINGS
153+
if (lex->fstring_args_idx) {
154+
// if there are saved chars, then we're currently injecting fstring args
155+
if (lex->fstring_args_idx < lex->fstring_args.len) {
156+
lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
157+
} else {
158+
// no more fstring arg bytes
159+
lex->chr2 = '\0';
160+
}
161+
162+
if (lex->chr0 == '\0') {
163+
// consumed all fstring data, restore saved input queue
164+
lex->chr0 = lex->chr0_saved;
165+
lex->chr1 = lex->chr1_saved;
166+
lex->chr2 = lex->chr2_saved;
167+
// stop consuming fstring arg data
168+
vstr_reset(&lex->fstring_args);
169+
lex->fstring_args_idx = 0;
170+
}
171+
} else
172+
#endif
173+
{
174+
lex->chr2 = lex->reader.readbyte(lex->reader.data);
175+
}
138176

139177
if (lex->chr1 == '\r') {
140178
// CR is a new line, converted to LF
@@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
272310
return true;
273311
}
274312

275-
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
313+
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
276314
// get first quoting character
277315
char quote_char = '\'';
278316
if (is_char(lex, '\"')) {
@@ -293,12 +331,57 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
293331
}
294332

295333
size_t n_closing = 0;
334+
#if MICROPY_PY_FSTRINGS
335+
if (is_fstring) {
336+
// assume there's going to be interpolation, so prep the injection data
337+
// fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
338+
// only when fstring_args_idx>0 will we consume the arg data
339+
// note: lex->fstring_args will be empty already (it's reset when finished)
340+
vstr_add_str(&lex->fstring_args, ".format(");
341+
}
342+
#endif
343+
296344
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
297345
if (is_char(lex, quote_char)) {
298346
n_closing += 1;
299347
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
300348
} else {
301349
n_closing = 0;
350+
351+
#if MICROPY_PY_FSTRINGS
352+
while (is_fstring && is_char(lex, '{')) {
353+
next_char(lex);
354+
if (is_char(lex, '{')) {
355+
// "{{" is passed through unchanged to be handled by str.format
356+
vstr_add_byte(&lex->vstr, '{');
357+
next_char(lex);
358+
} else {
359+
// remember the start of this argument (if we need it for f'{a=}').
360+
size_t i = lex->fstring_args.len;
361+
// extract characters inside the { until we reach the
362+
// format specifier or closing }.
363+
// (MicroPython limitation) note: this is completely unaware of
364+
// Python syntax and will not handle any expression containing '}' or ':'.
365+
// e.g. f'{"}"}' or f'{foo({})}'.
366+
while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
367+
// like the default case at the end of this function, stay 8-bit clean
368+
vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
369+
next_char(lex);
370+
}
371+
if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
372+
// if the last character of the arg was '=', then inject "arg=" before the '{'.
373+
// f'{a=}' --> 'a={}'.format(a)
374+
vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
375+
// remove the trailing '='
376+
lex->fstring_args.len--;
377+
}
378+
// comma-separate args
379+
vstr_add_byte(&lex->fstring_args, ',');
380+
}
381+
vstr_add_byte(&lex->vstr, '{');
382+
}
383+
#endif
384+
302385
if (is_char(lex, '\\')) {
303386
next_char(lex);
304387
unichar c = CUR_CHAR(lex);
@@ -451,6 +534,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
451534
}
452535

453536
void mp_lexer_to_next(mp_lexer_t *lex) {
537+
#if MICROPY_PY_FSTRINGS
538+
if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
539+
// moving onto the next token means the literal string is complete.
540+
// switch into injecting the format args.
541+
vstr_add_byte(&lex->fstring_args, ')');
542+
lex->chr0_saved = lex->chr0;
543+
lex->chr1_saved = lex->chr1;
544+
lex->chr2_saved = lex->chr2;
545+
lex->chr0 = lex->fstring_args.buf[0];
546+
lex->chr1 = lex->fstring_args.buf[1];
547+
lex->chr2 = lex->fstring_args.buf[2];
548+
// we've already extracted 3 chars, but setting this non-zero also
549+
// means we'll start consuming the fstring data
550+
lex->fstring_args_idx = 3;
551+
}
552+
#endif
553+
454554
// start new token text
455555
vstr_reset(&lex->vstr);
456556

@@ -506,6 +606,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
506606
do {
507607
// parse type codes
508608
bool is_raw = false;
609+
bool is_fstring = false;
509610
mp_token_kind_t kind = MP_TOKEN_STRING;
510611
int n_char = 0;
511612
if (is_char(lex, 'u')) {
@@ -524,7 +625,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
524625
kind = MP_TOKEN_BYTES;
525626
n_char = 2;
526627
}
628+
#if MICROPY_PY_FSTRINGS
629+
if (is_char_following(lex, 'f')) {
630+
// raw-f-strings unsupported, immediately return (invalid) token.
631+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
632+
break;
633+
}
634+
#endif
635+
}
636+
#if MICROPY_PY_FSTRINGS
637+
else if (is_char(lex, 'f')) {
638+
if (is_char_following(lex, 'r')) {
639+
// raw-f-strings unsupported, immediately return (invalid) token.
640+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
641+
break;
642+
}
643+
n_char = 1;
644+
is_fstring = true;
527645
}
646+
#endif
528647

529648
// Set or check token kind
530649
if (lex->tok_kind == MP_TOKEN_END) {
@@ -543,7 +662,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
543662
}
544663

545664
// Parse the literal
546-
parse_string_literal(lex, is_raw);
665+
parse_string_literal(lex, is_raw, is_fstring);
547666

548667
// Skip whitespace so we can check if there's another string following
549668
skip_whitespace(lex, true);
@@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
703822
lex->num_indent_level = 1;
704823
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
705824
vstr_init(&lex->vstr, 32);
825+
#if MICROPY_PY_FSTRINGS
826+
vstr_init(&lex->fstring_args, 0);
827+
#endif
706828

707829
// store sentinel for first indentation level
708830
lex->indent_level[0] = 0;

py/lexer.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
4444
MP_TOKEN_INVALID,
4545
MP_TOKEN_DEDENT_MISMATCH,
4646
MP_TOKEN_LONELY_STRING_OPEN,
47+
#if MICROPY_PY_FSTRINGS
48+
MP_TOKEN_MALFORMED_FSTRING,
49+
MP_TOKEN_FSTRING_RAW,
50+
#endif
4751

4852
MP_TOKEN_NEWLINE,
4953
MP_TOKEN_INDENT,
@@ -158,6 +162,9 @@ typedef struct _mp_lexer_t {
158162
mp_reader_t reader; // stream source
159163

160164
unichar chr0, chr1, chr2; // current cached characters from source
165+
#if MICROPY_PY_FSTRINGS
166+
unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
167+
#endif
161168

162169
size_t line; // current source line
163170
size_t column; // current source column
@@ -173,6 +180,10 @@ typedef struct _mp_lexer_t {
173180
size_t tok_column; // token source column
174181
mp_token_kind_t tok_kind; // token kind
175182
vstr_t vstr; // token data
183+
#if MICROPY_PY_FSTRINGS
184+
vstr_t fstring_args; // extracted arguments to pass to .format()
185+
size_t fstring_args_idx; // how many bytes of fstring_args have been read
186+
#endif
176187
} mp_lexer_t;
177188

178189
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);

py/mpconfig.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -875,6 +875,11 @@ typedef double mp_float_t;
875875
#define MICROPY_PY_ASYNC_AWAIT (1)
876876
#endif
877877

878+
// Support for literal string interpolation, f-strings (see PEP 498, Python 3.6+)
879+
#ifndef MICROPY_PY_FSTRINGS
880+
#define MICROPY_PY_FSTRINGS (0)
881+
#endif
882+
878883
// Support for assignment expressions with := (see PEP 572, Python 3.8+)
879884
#ifndef MICROPY_PY_ASSIGN_EXPR
880885
#define MICROPY_PY_ASSIGN_EXPR (1)

py/parse.c

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1152,6 +1152,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
11521152
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
11531153
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
11541154
MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
1155+
#if MICROPY_PY_FSTRINGS
1156+
} else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
1157+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1158+
MP_ERROR_TEXT("malformed f-string"));
1159+
} else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
1160+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1161+
MP_ERROR_TEXT("raw f-strings are not supported"));
1162+
#endif
11551163
} else {
11561164
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
11571165
MP_ERROR_TEXT("invalid syntax"));

tests/basics/string_fstring.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
def f():
2+
return 4
3+
def g(_):
4+
return 5
5+
def h():
6+
return 6
7+
8+
print(f'no interpolation')
9+
print(f"no interpolation")
10+
print(f"""no interpolation""")
11+
12+
x, y = 1, 2
13+
print(f'{x}')
14+
print(f'{x:08x}')
15+
print(f'{x=}')
16+
print(f'{x=:08x}')
17+
print(f'a {x} b {y} c')
18+
print(f'a {x:08x} b {y} c')
19+
print(f'a {x=} b {y} c')
20+
print(f'a {x=:08x} b {y} c')
21+
22+
print(f'a {"hello"} b')
23+
print(f'a {f() + g("foo") + h()} b')
24+
print(f'a {f() + g("foo") + h()=} b')
25+
print(f'a {f() + g("foo") + h()=:08x} b')
26+
27+
def foo(a, b):
28+
return f'{x}{y}{a}{b}'
29+
print(foo(7, 8))
30+
31+
# PEP-0498 specifies that '\\' and '#' must be disallowed explicitly, whereas
32+
# MicroPython relies on the syntax error as a result of the substitution.
33+
34+
print(f"\\")
35+
print(f'#')
36+
try:
37+
eval("f'{\}'")
38+
except SyntaxError:
39+
print('SyntaxError')
40+
try:
41+
eval("f'{#}'")
42+
except SyntaxError:
43+
print('SyntaxError')
44+
45+
46+
# PEP-0498 specifies that handling of double braces '{{' or '}}' should
47+
# behave like str.format.
48+
print(f'{{}}')
49+
print(f'{{{4*10}}}', '{40}')
50+
51+
# A single closing brace, unlike str.format should raise a syntax error.
52+
# MicroPython instead raises ValueError at runtime from the substitution.
53+
try:
54+
eval("f'{{}'")
55+
except (ValueError, SyntaxError):
56+
# MicroPython incorrectly raises ValueError here.
57+
print('SyntaxError')

tests/cmdline/cmd_parsetree.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,3 +10,4 @@
1010
e = b"a very long bytes that will not be interned"
1111
f = 123456789012345678901234567890
1212
g = 123
13+
h = f"fstring: '{b}'"

0 commit comments

Comments
 (0)