@@ -480,6 +480,88 @@ def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]:
     raise ValueError(f"Token {token!r} is not a valid f-string start")


+STATE_NOT_FSTRING: Final = 0  # not in an f-string
+STATE_MIDDLE: Final = 1  # in the string portion of an f-string (outside braces)
+STATE_IN_BRACES: Final = 2  # between braces in an f-string
+# in the format specifier (between the colon and the closing brace)
+STATE_IN_COLON: Final = 3
+
+
+class FStringState:
+    """Keeps track of state around f-strings.
+
+    The tokenizer should call the appropriate method on this class when
+    it transitions to a different part of an f-string. This is needed
+    because the tokenization depends on knowing where exactly we are in
+    the f-string.
+
+    For example, consider the following f-string:
+
+        f"a{1:b{2}c}d"
+
+    The following is the tokenization of this string and the states
+    tracked by this class:
+
+        1,0-1,2:    FSTRING_START 'f"'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
+        1,2-1,3:    FSTRING_MIDDLE 'a'
+        1,3-1,4:    LBRACE '{'  # [STATE_NOT_FSTRING, STATE_IN_BRACES]
+        1,4-1,5:    NUMBER '1'
+        1,5-1,6:    OP ':'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
+        1,6-1,7:    FSTRING_MIDDLE 'b'
+        1,7-1,8:    LBRACE '{'  # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
+        1,8-1,9:    NUMBER '2'
+        1,9-1,10:   RBRACE '}'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
+        1,10-1,11:  FSTRING_MIDDLE 'c'
+        1,11-1,12:  RBRACE '}'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
+        1,12-1,13:  FSTRING_MIDDLE 'd'
+        1,13-1,14:  FSTRING_END '"'  # [STATE_NOT_FSTRING]
+        1,14-1,15:  NEWLINE '\n'
+        2,0-2,0:    ENDMARKER ''
+
+    Notice that the nested braces in the format specifier are represented
+    by adding a STATE_IN_BRACES entry to the state stack. The stack is
+    also used if there are nested f-strings.
+
+    """
+
+    def __init__(self) -> None:
+        self.stack: List[int] = [STATE_NOT_FSTRING]
+
+    def is_in_fstring_expression(self) -> bool:
+        return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING)
+
+    def current(self) -> int:
+        return self.stack[-1]
+
+    def enter_fstring(self) -> None:
+        self.stack.append(STATE_MIDDLE)
+
+    def leave_fstring(self) -> None:
+        state = self.stack.pop()
+        assert state == STATE_MIDDLE
+
+    def consume_lbrace(self) -> None:
+        current_state = self.stack[-1]
+        if current_state == STATE_MIDDLE:
+            self.stack[-1] = STATE_IN_BRACES
+        elif current_state == STATE_IN_COLON:
+            self.stack.append(STATE_IN_BRACES)
+        else:
+            assert False, current_state
+
+    def consume_rbrace(self) -> None:
+        current_state = self.stack[-1]
+        assert current_state in (STATE_IN_BRACES, STATE_IN_COLON)
+        if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON:
+            self.stack.pop()
+        else:
+            self.stack[-1] = STATE_MIDDLE
+
+    def consume_colon(self) -> None:
+        assert self.stack[-1] == STATE_IN_BRACES, self.stack
+        self.stack[-1] = STATE_IN_COLON
+
+
 def generate_tokens(
     readline: Callable[[], str], grammar: Optional[Grammar] = None
 ) -> Iterator[GoodTokenInfo]:
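
For illustration, a minimal sketch (not part of the patch) of how the FStringState stack evolves for the docstring's example f-string, with the method calls made by hand here rather than by the tokenizer:

    state = FStringState()
    state.enter_fstring()   # FSTRING_START 'f"'  -> [STATE_NOT_FSTRING, STATE_MIDDLE]
    state.consume_lbrace()  # LBRACE '{'          -> [STATE_NOT_FSTRING, STATE_IN_BRACES]
    state.consume_colon()   # OP ':'              -> [STATE_NOT_FSTRING, STATE_IN_COLON]
    state.consume_lbrace()  # '{' in format spec  -> [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
    state.consume_rbrace()  # nested '}'          -> [STATE_NOT_FSTRING, STATE_IN_COLON]
    state.consume_rbrace()  # closing '}'         -> [STATE_NOT_FSTRING, STATE_MIDDLE]
    state.leave_fstring()   # FSTRING_END '"'     -> [STATE_NOT_FSTRING]
    assert state.current() == STATE_NOT_FSTRING
    assert not state.is_in_fstring_expression()

A nested f-string inside the braces would simply call enter_fstring() again, pushing another STATE_MIDDLE entry onto the stack.
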
@@ -498,12 +580,10 @@ def generate_tokens(
     and the line on which the token was found. The line passed is the
     logical line; continuation lines are included.
     """
-    lnum = parenlev = fstring_level = continued = 0
+    lnum = parenlev = continued = 0
     parenlev_stack: List[int] = []
-    inside_fstring_braces = False
-    inside_fstring_colon = False
+    fstring_state = FStringState()
     formatspec = ""
-    bracelev = 0
     numchars: Final[str] = "0123456789"
     contstr, needcont = "", 0
     contline: Optional[str] = None
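
As a usage sketch (not part of the patch, assuming these blib2to3-style token names and that f-string splitting is enabled for the grammar in use), the generator is driven by any readline callable and yields 5-tuples of token type, token string, (row, col) start, (row, col) end, and logical line:

    import io

    source = 'f"a{1:b{2}c}d"\n'
    for tok_type, tok_str, start, end, logical_line in generate_tokens(
        io.StringIO(source).readline
    ):
        # per the FStringState docstring, this starts with FSTRING_START 'f"',
        # then FSTRING_MIDDLE 'a', LBRACE '{', NUMBER '1', ...
        print(tok_type, repr(tok_str), start, end)
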
@@ -542,13 +622,15 @@ def generate_tokens(
                 spos = strstart
                 epos = (lnum, end)
                 tokenline = contline + line
-                if fstring_level == 0 and not is_fstring_start(token):
+                if (
+                    fstring_state.current() == STATE_NOT_FSTRING
+                    and not is_fstring_start(token)
+                ):
                     yield (STRING, token, spos, epos, tokenline)
                     endprog_stack.pop()
                     parenlev = parenlev_stack.pop()
                 else:
                     if is_fstring_start(token):
-                        fstring_level += 1
                         fstring_start, token = _split_fstring_start_and_middle(token)
                         fstring_start_epos = (lnum, spos[1] + len(fstring_start))
                         yield (
@@ -558,6 +640,7 @@ def generate_tokens(
                             fstring_start_epos,
                             tokenline,
                         )
+                        fstring_state.enter_fstring()
                         # increase spos to the end of the fstring start
                         spos = fstring_start_epos
@@ -572,7 +655,7 @@ def generate_tokens(
                             line,
                         )
                         yield (LBRACE, lbrace, lbrace_spos, epos, line)
-                        inside_fstring_braces = True
+                        fstring_state.consume_lbrace()
                     else:
                         if token.endswith(('"""', "'''")):
                             fstring_middle, fstring_end = token[:-3], token[-3:]
@@ -594,11 +677,9 @@ def generate_tokens(
                             epos,
                             line,
                         )
-                        fstring_level -= 1
+                        fstring_state.leave_fstring()
                         endprog_stack.pop()
                         parenlev = parenlev_stack.pop()
-                        if fstring_level > 0:
-                            inside_fstring_braces = True
                 pos = end
                 contstr, needcont = "", 0
                 contline = None
@@ -619,7 +700,11 @@ def generate_tokens(
                 continue

         # new statement
-        elif parenlev == 0 and not continued and not inside_fstring_braces:
+        elif (
+            parenlev == 0
+            and not continued
+            and not fstring_state.is_in_fstring_expression()
+        ):
             if not line:
                 break
             column = 0
@@ -687,7 +772,7 @@ def generate_tokens(
             continued = 0

         while pos < max:
-            if fstring_level > 0 and not inside_fstring_braces:
+            if fstring_state.current() == STATE_MIDDLE:
                 endprog = endprog_stack[-1]
                 endmatch = endprog.match(line, pos)
                 if endmatch:  # all on one line
@@ -718,14 +803,12 @@ def generate_tokens(
                             (lnum, end),
                             line,
                         )
-                        fstring_level -= 1
+                        fstring_state.leave_fstring()
                         endprog_stack.pop()
                         parenlev = parenlev_stack.pop()
-                        if fstring_level > 0:
-                            inside_fstring_braces = True
                     else:
                         yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
-                        inside_fstring_braces = True
+                        fstring_state.consume_lbrace()
                     pos = end
                     continue
                 else:  # multiple lines
@@ -734,7 +817,7 @@ def generate_tokens(
                     contline = line
                     break

-            if inside_fstring_colon:
+            if fstring_state.current() == STATE_IN_COLON:
                 match = fstring_middle_after_colon.match(line, pos)
                 if match is None:
                     formatspec += line[pos:]
@@ -754,15 +837,19 @@ def generate_tokens(
                 formatspec = ""

                 if brace_or_nl == "{":
-                    yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line)
-                    bracelev += 1
+                    yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line)
+                    fstring_state.consume_lbrace()
+                    end = brace_end
+                elif brace_or_nl == "}":
+                    yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line)
+                    fstring_state.consume_rbrace()
                     end = brace_end
+                    formatspec_start = (lnum, brace_end)

-                inside_fstring_colon = False
                 pos = end
                 continue

-            if inside_fstring_braces and parenlev == 0:
+            if fstring_state.current() == STATE_IN_BRACES and parenlev == 0:
                 match = bang.match(line, pos)
                 if match:
                     start, end = match.span(1)
@@ -774,7 +861,7 @@ def generate_tokens(
                 if match:
                     start, end = match.span(1)
                     yield (OP, ":", (lnum, start), (lnum, end), line)
-                    inside_fstring_colon = True
+                    fstring_state.consume_colon()
                     formatspec_start = (lnum, end)
                     pos = end
                     continue
@@ -791,7 +878,7 @@ def generate_tokens(
                     yield (NUMBER, token, spos, epos, line)
                 elif initial in "\r\n":
                     newline = NEWLINE
-                    if parenlev > 0 or inside_fstring_braces:
+                    if parenlev > 0 or fstring_state.is_in_fstring_expression():
                         newline = NL
                     elif async_def:
                         async_def_nl = True
@@ -813,7 +900,7 @@ def generate_tokens(
                     parenlev = 0
                     if is_fstring_start(token):
                         yield (FSTRING_START, token, spos, epos, line)
-                        fstring_level += 1
+                        fstring_state.enter_fstring()

                     endmatch = endprog.match(line, pos)
                     if endmatch:  # all on one line
@@ -848,11 +935,9 @@ def generate_tokens(
                                 epos,
                                 line,
                             )
-                            fstring_level -= 1
+                            fstring_state.leave_fstring()
                             endprog_stack.pop()
                             parenlev = parenlev_stack.pop()
-                            if fstring_level > 0:
-                                inside_fstring_braces = True
                         else:
                             fstring_middle, lbrace = token[:-1], token[-1]
                             fstring_middle_epos = lbrace_spos = (lnum, end - 1)
@@ -864,7 +949,7 @@ def generate_tokens(
                                 line,
                             )
                             yield (LBRACE, lbrace, lbrace_spos, epos, line)
-                            inside_fstring_braces = True
+                            fstring_state.consume_lbrace()
                         pos = end
                     else:
                         # multiple lines
@@ -919,7 +1004,7 @@ def generate_tokens(

                     start_epos = (lnum, start + offset)
                     yield (FSTRING_START, fstring_start, spos, start_epos, line)
-                    fstring_level += 1
+                    fstring_state.enter_fstring()
                     endprog = endprogs[fstring_start]
                     endprog_stack.append(endprog)
                     parenlev_stack.append(parenlev)
@@ -940,16 +1025,14 @@ def generate_tokens(
                         end_spos = (lnum, end_offset)
                         end_epos = (lnum, end_offset + 1)
                         yield (FSTRING_END, token[-1], end_spos, end_epos, line)
-                        fstring_level -= 1
+                        fstring_state.leave_fstring()
                         endprog_stack.pop()
                         parenlev = parenlev_stack.pop()
-                        if fstring_level > 0:
-                            inside_fstring_braces = True
                     else:
                         end_spos = (lnum, end_offset)
                         end_epos = (lnum, end_offset + 1)
                         yield (LBRACE, "{", end_spos, end_epos, line)
-                        inside_fstring_braces = True
+                        fstring_state.consume_lbrace()

                 elif initial.isidentifier():  # ordinary name
                     if token in ("async", "await"):
@@ -998,19 +1081,13 @@ def generate_tokens(
                 elif (
                     initial == "}"
                     and parenlev == 0
-                    and bracelev == 0
-                    and fstring_level > 0
+                    and fstring_state.is_in_fstring_expression()
                 ):
                     yield (RBRACE, token, spos, epos, line)
-                    inside_fstring_braces = False
+                    fstring_state.consume_rbrace()
+                    formatspec_start = epos
                 else:
-                    if parenlev == 0 and bracelev > 0 and initial == "}":
-                        bracelev -= 1
-                        # if we're still inside fstrings, we're still part of the format spec
-                        if inside_fstring_braces:
-                            inside_fstring_colon = True
-                            formatspec_start = (lnum, pos)
-                    elif initial in "([{":
+                    if initial in "([{":
                         parenlev += 1
                     elif initial in ")]}":
                         parenlev -= 1