Skip to content

Commit e4aaa8a

Browse files
Fix incorrect f-string tokenization (#4332)
1 parent ba88fc3 commit e4aaa8a

File tree

3 files changed

+130
-43
lines changed

3 files changed

+130
-43
lines changed

CHANGES.md

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
<!-- Changes to the parser or to version autodetection -->
2828

29+
- Fix regression where certain complex f-strings failed to parse (#4332)
30+
2931
### Performance
3032

3133
<!-- Changes that improve Black's performance. -->

src/blib2to3/pgen2/tokenize.py

+120-43
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,88 @@ def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]:
480480
raise ValueError(f"Token {token!r} is not a valid f-string start")
481481

482482

483+
STATE_NOT_FSTRING: Final = 0  # not in an f-string
STATE_MIDDLE: Final = 1  # in the string portion of an f-string (outside braces)
STATE_IN_BRACES: Final = 2  # between braces in an f-string
# in the format specifier (between the colon and the closing brace)
STATE_IN_COLON: Final = 3


class FStringState:
    """Keeps track of state around f-strings.

    The tokenizer should call the appropriate method on this class when
    it transitions to a different part of an f-string. This is needed
    because the tokenization depends on knowing where exactly we are in
    the f-string.

    For example, consider the following f-string:

        f"a{1:b{2}c}d"

    The following is the tokenization of this string and the states
    tracked by this class:

        1,0-1,2:    FSTRING_START   'f"'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
        1,2-1,3:    FSTRING_MIDDLE  'a'
        1,3-1,4:    LBRACE  '{'  # [STATE_NOT_FSTRING, STATE_IN_BRACES]
        1,4-1,5:    NUMBER  '1'
        1,5-1,6:    OP  ':'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
        1,6-1,7:    FSTRING_MIDDLE  'b'
        1,7-1,8:    LBRACE  '{'  # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
        1,8-1,9:    NUMBER  '2'
        1,9-1,10:   RBRACE  '}'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
        1,10-1,11:  FSTRING_MIDDLE  'c'
        1,11-1,12:  RBRACE  '}'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
        1,12-1,13:  FSTRING_MIDDLE  'd'
        1,13-1,14:  FSTRING_END  '"'  # [STATE_NOT_FSTRING]
        1,14-1,15:  NEWLINE  '\\n'
        2,0-2,0:    ENDMARKER  ''

    Notice that the nested braces in the format specifier are represented
    by adding a STATE_IN_BRACES entry to the state stack. The stack is
    also used if there are nested f-strings.

    Every method that changes state raises AssertionError when it is called
    from a state in which that transition makes no sense.
    """

    def __init__(self) -> None:
        # The bottom entry is always STATE_NOT_FSTRING; entering an f-string
        # or a nested replacement field pushes a new entry on top of it.
        self.stack: List[int] = [STATE_NOT_FSTRING]

    def is_in_fstring_expression(self) -> bool:
        """Return True when the innermost state is inside braces or a format spec."""
        return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING)

    def current(self) -> int:
        """Return the innermost (top-of-stack) state."""
        return self.stack[-1]

    def enter_fstring(self) -> None:
        """Record that an f-string start was consumed (possibly a nested one)."""
        self.stack.append(STATE_MIDDLE)

    def leave_fstring(self) -> None:
        """Record that the end of the innermost f-string was consumed.

        Raises:
            AssertionError: if the innermost state is not STATE_MIDDLE.
        """
        # Validate before popping so that an unbalanced call cannot remove the
        # STATE_NOT_FSTRING sentinel and leave the stack empty.
        state = self.stack[-1]
        assert state == STATE_MIDDLE, state
        self.stack.pop()

    def consume_lbrace(self) -> None:
        """Record an opening brace that starts a replacement field.

        Raises:
            AssertionError: if a replacement field cannot start in the
                current state.
        """
        current_state = self.stack[-1]
        if current_state == STATE_MIDDLE:
            # A field in the string portion replaces the state in place.
            self.stack[-1] = STATE_IN_BRACES
        elif current_state == STATE_IN_COLON:
            # A field nested in a format spec is pushed, so that the matching
            # closing brace returns to the enclosing format spec.
            self.stack.append(STATE_IN_BRACES)
        else:
            # Raise explicitly: `assert False` is removed under `python -O`,
            # which would let an invalid transition pass unnoticed.
            raise AssertionError(current_state)

    def consume_rbrace(self) -> None:
        """Record a closing brace that ends a replacement field.

        Raises:
            AssertionError: if the innermost state is neither STATE_IN_BRACES
                nor STATE_IN_COLON.
        """
        current_state = self.stack[-1]
        assert current_state in (STATE_IN_BRACES, STATE_IN_COLON)
        if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON:
            # Closing a field nested in a format spec: resume that format spec.
            self.stack.pop()
        else:
            # Closing a top-level field: back to the string portion.
            self.stack[-1] = STATE_MIDDLE

    def consume_colon(self) -> None:
        """Record the colon that starts a format specifier.

        Raises:
            AssertionError: if the innermost state is not STATE_IN_BRACES.
        """
        assert self.stack[-1] == STATE_IN_BRACES, self.stack
        self.stack[-1] = STATE_IN_COLON
564+
483565
def generate_tokens(
484566
readline: Callable[[], str], grammar: Optional[Grammar] = None
485567
) -> Iterator[GoodTokenInfo]:
@@ -498,12 +580,10 @@ def generate_tokens(
498580
and the line on which the token was found. The line passed is the
499581
logical line; continuation lines are included.
500582
"""
501-
lnum = parenlev = fstring_level = continued = 0
583+
lnum = parenlev = continued = 0
502584
parenlev_stack: List[int] = []
503-
inside_fstring_braces = False
504-
inside_fstring_colon = False
585+
fstring_state = FStringState()
505586
formatspec = ""
506-
bracelev = 0
507587
numchars: Final[str] = "0123456789"
508588
contstr, needcont = "", 0
509589
contline: Optional[str] = None
@@ -542,13 +622,15 @@ def generate_tokens(
542622
spos = strstart
543623
epos = (lnum, end)
544624
tokenline = contline + line
545-
if fstring_level == 0 and not is_fstring_start(token):
625+
if (
626+
fstring_state.current() == STATE_NOT_FSTRING
627+
and not is_fstring_start(token)
628+
):
546629
yield (STRING, token, spos, epos, tokenline)
547630
endprog_stack.pop()
548631
parenlev = parenlev_stack.pop()
549632
else:
550633
if is_fstring_start(token):
551-
fstring_level += 1
552634
fstring_start, token = _split_fstring_start_and_middle(token)
553635
fstring_start_epos = (lnum, spos[1] + len(fstring_start))
554636
yield (
@@ -558,6 +640,7 @@ def generate_tokens(
558640
fstring_start_epos,
559641
tokenline,
560642
)
643+
fstring_state.enter_fstring()
561644
# increase spos to the end of the fstring start
562645
spos = fstring_start_epos
563646

@@ -572,7 +655,7 @@ def generate_tokens(
572655
line,
573656
)
574657
yield (LBRACE, lbrace, lbrace_spos, epos, line)
575-
inside_fstring_braces = True
658+
fstring_state.consume_lbrace()
576659
else:
577660
if token.endswith(('"""', "'''")):
578661
fstring_middle, fstring_end = token[:-3], token[-3:]
@@ -594,11 +677,9 @@ def generate_tokens(
594677
epos,
595678
line,
596679
)
597-
fstring_level -= 1
680+
fstring_state.leave_fstring()
598681
endprog_stack.pop()
599682
parenlev = parenlev_stack.pop()
600-
if fstring_level > 0:
601-
inside_fstring_braces = True
602683
pos = end
603684
contstr, needcont = "", 0
604685
contline = None
@@ -619,7 +700,11 @@ def generate_tokens(
619700
continue
620701

621702
# new statement
622-
elif parenlev == 0 and not continued and not inside_fstring_braces:
703+
elif (
704+
parenlev == 0
705+
and not continued
706+
and not fstring_state.is_in_fstring_expression()
707+
):
623708
if not line:
624709
break
625710
column = 0
@@ -687,7 +772,7 @@ def generate_tokens(
687772
continued = 0
688773

689774
while pos < max:
690-
if fstring_level > 0 and not inside_fstring_braces:
775+
if fstring_state.current() == STATE_MIDDLE:
691776
endprog = endprog_stack[-1]
692777
endmatch = endprog.match(line, pos)
693778
if endmatch: # all on one line
@@ -718,14 +803,12 @@ def generate_tokens(
718803
(lnum, end),
719804
line,
720805
)
721-
fstring_level -= 1
806+
fstring_state.leave_fstring()
722807
endprog_stack.pop()
723808
parenlev = parenlev_stack.pop()
724-
if fstring_level > 0:
725-
inside_fstring_braces = True
726809
else:
727810
yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
728-
inside_fstring_braces = True
811+
fstring_state.consume_lbrace()
729812
pos = end
730813
continue
731814
else: # multiple lines
@@ -734,7 +817,7 @@ def generate_tokens(
734817
contline = line
735818
break
736819

737-
if inside_fstring_colon:
820+
if fstring_state.current() == STATE_IN_COLON:
738821
match = fstring_middle_after_colon.match(line, pos)
739822
if match is None:
740823
formatspec += line[pos:]
@@ -754,15 +837,19 @@ def generate_tokens(
754837
formatspec = ""
755838

756839
if brace_or_nl == "{":
757-
yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line)
758-
bracelev += 1
840+
yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line)
841+
fstring_state.consume_lbrace()
842+
end = brace_end
843+
elif brace_or_nl == "}":
844+
yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line)
845+
fstring_state.consume_rbrace()
759846
end = brace_end
847+
formatspec_start = (lnum, brace_end)
760848

761-
inside_fstring_colon = False
762849
pos = end
763850
continue
764851

765-
if inside_fstring_braces and parenlev == 0:
852+
if fstring_state.current() == STATE_IN_BRACES and parenlev == 0:
766853
match = bang.match(line, pos)
767854
if match:
768855
start, end = match.span(1)
@@ -774,7 +861,7 @@ def generate_tokens(
774861
if match:
775862
start, end = match.span(1)
776863
yield (OP, ":", (lnum, start), (lnum, end), line)
777-
inside_fstring_colon = True
864+
fstring_state.consume_colon()
778865
formatspec_start = (lnum, end)
779866
pos = end
780867
continue
@@ -791,7 +878,7 @@ def generate_tokens(
791878
yield (NUMBER, token, spos, epos, line)
792879
elif initial in "\r\n":
793880
newline = NEWLINE
794-
if parenlev > 0 or inside_fstring_braces:
881+
if parenlev > 0 or fstring_state.is_in_fstring_expression():
795882
newline = NL
796883
elif async_def:
797884
async_def_nl = True
@@ -813,7 +900,7 @@ def generate_tokens(
813900
parenlev = 0
814901
if is_fstring_start(token):
815902
yield (FSTRING_START, token, spos, epos, line)
816-
fstring_level += 1
903+
fstring_state.enter_fstring()
817904

818905
endmatch = endprog.match(line, pos)
819906
if endmatch: # all on one line
@@ -848,11 +935,9 @@ def generate_tokens(
848935
epos,
849936
line,
850937
)
851-
fstring_level -= 1
938+
fstring_state.leave_fstring()
852939
endprog_stack.pop()
853940
parenlev = parenlev_stack.pop()
854-
if fstring_level > 0:
855-
inside_fstring_braces = True
856941
else:
857942
fstring_middle, lbrace = token[:-1], token[-1]
858943
fstring_middle_epos = lbrace_spos = (lnum, end - 1)
@@ -864,7 +949,7 @@ def generate_tokens(
864949
line,
865950
)
866951
yield (LBRACE, lbrace, lbrace_spos, epos, line)
867-
inside_fstring_braces = True
952+
fstring_state.consume_lbrace()
868953
pos = end
869954
else:
870955
# multiple lines
@@ -919,7 +1004,7 @@ def generate_tokens(
9191004

9201005
start_epos = (lnum, start + offset)
9211006
yield (FSTRING_START, fstring_start, spos, start_epos, line)
922-
fstring_level += 1
1007+
fstring_state.enter_fstring()
9231008
endprog = endprogs[fstring_start]
9241009
endprog_stack.append(endprog)
9251010
parenlev_stack.append(parenlev)
@@ -940,16 +1025,14 @@ def generate_tokens(
9401025
end_spos = (lnum, end_offset)
9411026
end_epos = (lnum, end_offset + 1)
9421027
yield (FSTRING_END, token[-1], end_spos, end_epos, line)
943-
fstring_level -= 1
1028+
fstring_state.leave_fstring()
9441029
endprog_stack.pop()
9451030
parenlev = parenlev_stack.pop()
946-
if fstring_level > 0:
947-
inside_fstring_braces = True
9481031
else:
9491032
end_spos = (lnum, end_offset)
9501033
end_epos = (lnum, end_offset + 1)
9511034
yield (LBRACE, "{", end_spos, end_epos, line)
952-
inside_fstring_braces = True
1035+
fstring_state.consume_lbrace()
9531036

9541037
elif initial.isidentifier(): # ordinary name
9551038
if token in ("async", "await"):
@@ -998,19 +1081,13 @@ def generate_tokens(
9981081
elif (
9991082
initial == "}"
10001083
and parenlev == 0
1001-
and bracelev == 0
1002-
and fstring_level > 0
1084+
and fstring_state.is_in_fstring_expression()
10031085
):
10041086
yield (RBRACE, token, spos, epos, line)
1005-
inside_fstring_braces = False
1087+
fstring_state.consume_rbrace()
1088+
formatspec_start = epos
10061089
else:
1007-
if parenlev == 0 and bracelev > 0 and initial == "}":
1008-
bracelev -= 1
1009-
# if we're still inside fstrings, we're still part of the format spec
1010-
if inside_fstring_braces:
1011-
inside_fstring_colon = True
1012-
formatspec_start = (lnum, pos)
1013-
elif initial in "([{":
1090+
if initial in "([{":
10141091
parenlev += 1
10151092
elif initial in ")]}":
10161093
parenlev -= 1

tests/data/cases/pep_701.py

+8
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@
119119
level=0,
120120
)
121121

122+
f"{1:{f'{2}'}}"
123+
f'{1:{f'{2}'}}'
124+
f'{1:{2}d}'
125+
122126
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
123127

124128
# output
@@ -243,4 +247,8 @@
243247
level=0,
244248
)
245249

250+
f"{1:{f'{2}'}}"
251+
f"{1:{f'{2}'}}"
252+
f"{1:{2}d}"
253+
246254
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'

0 commit comments

Comments
 (0)