
Commit 3839819

Add support for the new f-string tokens per PEP 701 (#6659)
This PR adds support in the lexer for the new f-string tokens introduced by PEP 701. The following new tokens are added:

* `FStringStart`: token for the start of an f-string. This includes the `f`/`F`/`fr` prefix and the opening quote(s).
* `FStringMiddle`: token for the portion of text inside the f-string that is not part of an expression and is not an opening or closing brace.
* `FStringEnd`: token for the end of an f-string. This includes the closing quote(s).

Additionally, a new `Exclamation` token is added for the conversion marker (`f"{foo!s}"`), as that is part of the expression part.

New test cases are added for the various possibilities using snapshot testing. The output has been verified against python/cpython@f2cc00527e.

_I've put the number of f-strings for each of the following files after the file name:_

```
lexer/large/dataset.py (1)     1.05   612.6±91.60µs    66.4 MB/sec   1.00   584.7±33.72µs    69.6 MB/sec
lexer/numpy/ctypeslib.py (0)   1.01   131.8±3.31µs    126.3 MB/sec   1.00   130.9±5.37µs    127.2 MB/sec
lexer/numpy/globals.py (1)     1.02    13.2±0.43µs    222.7 MB/sec   1.00    13.0±0.41µs    226.8 MB/sec
lexer/pydantic/types.py (8)    1.13   285.0±11.72µs    89.5 MB/sec   1.00   252.9±10.13µs   100.8 MB/sec
lexer/unicode/pypinyin.py (0)  1.03    32.9±1.92µs    127.5 MB/sec   1.00    31.8±1.25µs    132.0 MB/sec
```

It seems that overall the lexer has regressed. I profiled every file mentioned above and saw one improvement, which is done in (098ee5d); otherwise I don't see anything else. A few notes from isolating the f-string part of the profile:

* As we're adding new tokens and the functionality to emit them, I expect the lexer to take more time simply because there is more code.
* `lex_fstring_middle_or_end` takes the most time, followed by the `current_mut` line when lexing the `:` token. The latter checks whether we're at the start of a format spec or not.
* In an f-string-heavy file such as https://github.com/python/cpython/blob/main/Lib/test/test_fstring.py [^1] (293 f-strings), most of the time in `lex_fstring_middle_or_end` is accounted for by the string allocation for the string-literal part of the `FStringMiddle` token (https://share.firefox.dev/3ErEa1W).

I don't see anything out of the ordinary in the `pydantic/types` profile (https://share.firefox.dev/45XcLRq).

fixes: #7042

[^1]: We could add this file to the lexer and parser benchmarks.
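For illustration, here is a rough sketch (not part of the commit) of how one f-string might split across the new tokens. The sample string, the offsets, and the treatment of the format spec as `FStringMiddle` text are assumptions based on the description above and the snapshot tests further down:

```rust
// Hypothetical walkthrough of token boundaries for a single f-string.
fn main() {
    let source = r#"f"hello {name!s:>10}""#;
    // FStringStart  -> `f"`      (prefix + opening quote)         0..2
    // FStringMiddle -> `hello `  (literal text before the brace)  2..8
    // Lbrace        -> `{`                                        8..9
    // Name          -> `name`                                     9..13
    // Exclamation   -> `!`       (conversion marker)              13..14
    // Name          -> `s`                                        14..15
    // Colon         -> `:`       (starts the format spec)         15..16
    // FStringMiddle -> `>10`     (format spec text)               16..19
    // Rbrace        -> `}`                                        19..20
    // FStringEnd    -> `"`       (closing quote)                  20..21
    assert_eq!(&source[0..2], "f\"");
    assert_eq!(&source[2..8], "hello ");
    assert_eq!(&source[16..19], ">10");
    println!("token boundaries illustrated for {source}");
}
```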
1 parent 246d93e commit 3839819

24 files changed (+2316 -11 lines)

Cargo.lock (+1)

crates/ruff_python_parser/Cargo.toml (+1)
@@ -18,6 +18,7 @@ ruff_python_ast = { path = "../ruff_python_ast" }
 ruff_text_size = { path = "../ruff_text_size" }

 anyhow = { workspace = true }
+bitflags = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { version = "0.20.0", default-features = false }

crates/ruff_python_parser/src/lexer.rs (+411 -10) (large diff not rendered)

crates/ruff_python_parser/src/lexer/cursor.rs (+12)
@@ -96,6 +96,18 @@ impl<'a> Cursor<'a> {
         }
     }

+    pub(super) fn eat_char3(&mut self, c1: char, c2: char, c3: char) -> bool {
+        let mut chars = self.chars.clone();
+        if chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3) {
+            self.bump();
+            self.bump();
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
     pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
     where
         F: FnMut(char) -> bool,
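As a side note, here is a standalone sketch (not from the diff) of the all-or-nothing lookahead that `eat_char3` implements: clone the character iterator, check three characters ahead, and only advance the real cursor when all three match. The `eat_three` helper and the triple-quote use case are illustrative assumptions; only std's `Chars` API is used.

```rust
// Standalone illustration of the `eat_char3` pattern using std's `Chars`.
fn eat_three(chars: &mut std::str::Chars<'_>, c1: char, c2: char, c3: char) -> bool {
    // Look ahead on a clone so a failed match leaves the original untouched.
    let mut lookahead = chars.clone();
    if lookahead.next() == Some(c1) && lookahead.next() == Some(c2) && lookahead.next() == Some(c3) {
        // Commit: consume the three matched characters from the real iterator.
        chars.next();
        chars.next();
        chars.next();
        true
    } else {
        false
    }
}

fn main() {
    // e.g. consuming the closing quotes of a triple-quoted f-string in one step.
    let mut rest = r#"""" and more"#.chars();
    assert!(eat_three(&mut rest, '"', '"', '"'));
    assert_eq!(rest.as_str(), " and more");
}
```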
@@ -0,0 +1,158 @@
use bitflags::bitflags;

use ruff_text_size::TextSize;

bitflags! {
    #[derive(Debug)]
    pub(crate) struct FStringContextFlags: u8 {
        /// The current f-string is a triple-quoted f-string i.e., the number of
        /// opening quotes is 3. If this flag is not set, the number of opening
        /// quotes is 1.
        const TRIPLE = 1 << 0;

        /// The current f-string is a double-quoted f-string. If this flag is not
        /// set, the current f-string is a single-quoted f-string.
        const DOUBLE = 1 << 1;

        /// The current f-string is a raw f-string i.e., prefixed with `r`/`R`.
        /// If this flag is not set, the current f-string is a normal f-string.
        const RAW = 1 << 2;
    }
}

/// The context representing the current f-string that the lexer is in.
#[derive(Debug)]
pub(crate) struct FStringContext {
    flags: FStringContextFlags,

    /// The level of nesting for the lexer when it entered the current f-string.
    /// The nesting level includes all kinds of parentheses i.e., round, square,
    /// and curly.
    nesting: u32,

    /// The current depth of format spec for the current f-string. This is because
    /// there can be multiple format specs nested for the same f-string.
    /// For example, `{a:{b:{c}}}` has 3 format specs.
    format_spec_depth: u32,
}

impl FStringContext {
    pub(crate) const fn new(flags: FStringContextFlags, nesting: u32) -> Self {
        Self {
            flags,
            nesting,
            format_spec_depth: 0,
        }
    }

    pub(crate) const fn nesting(&self) -> u32 {
        self.nesting
    }

    /// Returns the quote character for the current f-string.
    pub(crate) const fn quote_char(&self) -> char {
        if self.flags.contains(FStringContextFlags::DOUBLE) {
            '"'
        } else {
            '\''
        }
    }

    /// Returns the number of quotes for the current f-string.
    pub(crate) const fn quote_size(&self) -> TextSize {
        if self.is_triple_quoted() {
            TextSize::new(3)
        } else {
            TextSize::new(1)
        }
    }

    /// Returns the triple quotes for the current f-string if it is a triple-quoted
    /// f-string, `None` otherwise.
    pub(crate) const fn triple_quotes(&self) -> Option<&'static str> {
        if self.is_triple_quoted() {
            if self.flags.contains(FStringContextFlags::DOUBLE) {
                Some(r#"""""#)
            } else {
                Some("'''")
            }
        } else {
            None
        }
    }

    /// Returns `true` if the current f-string is a raw f-string.
    pub(crate) const fn is_raw_string(&self) -> bool {
        self.flags.contains(FStringContextFlags::RAW)
    }

    /// Returns `true` if the current f-string is a triple-quoted f-string.
    pub(crate) const fn is_triple_quoted(&self) -> bool {
        self.flags.contains(FStringContextFlags::TRIPLE)
    }

    /// Calculates the number of open parentheses for the current f-string
    /// based on the current level of nesting for the lexer.
    const fn open_parentheses_count(&self, current_nesting: u32) -> u32 {
        current_nesting.saturating_sub(self.nesting)
    }

    /// Returns `true` if the lexer is in a f-string expression i.e., between
    /// two curly braces.
    pub(crate) const fn is_in_expression(&self, current_nesting: u32) -> bool {
        self.open_parentheses_count(current_nesting) > self.format_spec_depth
    }

    /// Returns `true` if the lexer is in a f-string format spec i.e., after a colon.
    pub(crate) const fn is_in_format_spec(&self, current_nesting: u32) -> bool {
        self.format_spec_depth > 0 && !self.is_in_expression(current_nesting)
    }

    /// Returns `true` if the context is in a valid position to start format spec
    /// i.e., at the same level of nesting as the opening parentheses token.
    /// Increments the format spec depth if it is.
    ///
    /// This assumes that the current character for the lexer is a colon (`:`).
    pub(crate) fn try_start_format_spec(&mut self, current_nesting: u32) -> bool {
        if self
            .open_parentheses_count(current_nesting)
            .saturating_sub(self.format_spec_depth)
            == 1
        {
            self.format_spec_depth += 1;
            true
        } else {
            false
        }
    }

    /// Decrements the format spec depth unconditionally.
    pub(crate) fn end_format_spec(&mut self) {
        self.format_spec_depth = self.format_spec_depth.saturating_sub(1);
    }
}

/// The f-strings stack is used to keep track of all the f-strings that the
/// lexer encounters. This is necessary because f-strings can be nested.
#[derive(Debug, Default)]
pub(crate) struct FStrings {
    stack: Vec<FStringContext>,
}

impl FStrings {
    pub(crate) fn push(&mut self, context: FStringContext) {
        self.stack.push(context);
    }

    pub(crate) fn pop(&mut self) -> Option<FStringContext> {
        self.stack.pop()
    }

    pub(crate) fn current(&self) -> Option<&FStringContext> {
        self.stack.last()
    }

    pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> {
        self.stack.last_mut()
    }
}
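To make the nesting and format-spec bookkeeping above concrete, here is a test-style sketch of how the lexer might drive `FStringContext` while lexing `f"{a:{b}}"`. The types and methods come from the module above; the module name `context_sketch` and the exact call sequence are assumptions for illustration, since the real driver code in `lexer.rs` is not shown on this page.

```rust
#[cfg(test)]
mod context_sketch {
    use super::*;

    #[test]
    fn tracks_nested_format_spec() {
        // Entered a double-quoted, non-raw f-string while the lexer was at
        // nesting level 0.
        let mut ctx = FStringContext::new(FStringContextFlags::DOUBLE, 0);
        assert_eq!(ctx.quote_char(), '"');
        assert!(!ctx.is_triple_quoted());

        // After the outer `{`, the lexer's nesting is 1: expression part.
        assert!(ctx.is_in_expression(1));

        // The `:` after `a` sits exactly one paren above the f-string's own
        // nesting level, so it starts a format spec.
        assert!(ctx.try_start_format_spec(1));
        assert!(ctx.is_in_format_spec(1));

        // Inside the nested `{b}` the nesting rises to 2: expression again.
        assert!(ctx.is_in_expression(2));

        // Once the spec ends, the depth is decremented and the context no
        // longer reports a format spec at nesting level 1.
        ctx.end_format_spec();
        assert!(!ctx.is_in_format_spec(1));
    }
}
```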
@@ -0,0 +1,66 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: lex_source(source)
---
[
    (
        FStringStart,
        0..2,
    ),
    (
        FStringEnd,
        2..3,
    ),
    (
        String {
            value: "",
            kind: String,
            triple_quoted: false,
        },
        4..6,
    ),
    (
        FStringStart,
        7..9,
    ),
    (
        FStringEnd,
        9..10,
    ),
    (
        FStringStart,
        11..13,
    ),
    (
        FStringEnd,
        13..14,
    ),
    (
        String {
            value: "",
            kind: String,
            triple_quoted: false,
        },
        15..17,
    ),
    (
        FStringStart,
        18..22,
    ),
    (
        FStringEnd,
        22..25,
    ),
    (
        FStringStart,
        26..30,
    ),
    (
        FStringEnd,
        30..33,
    ),
    (
        Newline,
        33..33,
    ),
]
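Working backwards from the token ranges, one input that would plausibly produce the snapshot above is a line of empty f-strings mixed with plain strings. The exact quote styles cannot all be recovered from the ranges (for example, `F""` vs `f''` at 7..10), so the string below is a best-effort reconstruction, not the literal test source:

```rust
// Hypothetical reconstruction; the slices line up with the ranges above.
fn main() {
    let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
    assert_eq!(&source[0..2], "f\"");        // FStringStart 0..2
    assert_eq!(&source[2..3], "\"");         // FStringEnd   2..3
    assert_eq!(&source[4..6], "\"\"");       // String       4..6
    assert_eq!(&source[18..22], "f\"\"\"");  // FStringStart 18..22
    assert_eq!(&source[22..25], "\"\"\"");   // FStringEnd   22..25
    assert_eq!(&source[26..30], "f'''");     // FStringStart 26..30
    assert_eq!(&source[30..33], "'''");      // FStringEnd   30..33
    assert_eq!(source.len(), 33);            // Newline emitted at 33..33
}
```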
@@ -0,0 +1,88 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: lex_source(source)
---
[
    (
        FStringStart,
        0..2,
    ),
    (
        FStringMiddle {
            value: "normal ",
            is_raw: false,
        },
        2..9,
    ),
    (
        Lbrace,
        9..10,
    ),
    (
        Name {
            name: "foo",
        },
        10..13,
    ),
    (
        Rbrace,
        13..14,
    ),
    (
        FStringMiddle {
            value: " {another} ",
            is_raw: false,
        },
        14..27,
    ),
    (
        Lbrace,
        27..28,
    ),
    (
        Name {
            name: "bar",
        },
        28..31,
    ),
    (
        Rbrace,
        31..32,
    ),
    (
        FStringMiddle {
            value: " {",
            is_raw: false,
        },
        32..35,
    ),
    (
        Lbrace,
        35..36,
    ),
    (
        Name {
            name: "three",
        },
        36..41,
    ),
    (
        Rbrace,
        41..42,
    ),
    (
        FStringMiddle {
            value: "}",
            is_raw: false,
        },
        42..44,
    ),
    (
        FStringEnd,
        44..45,
    ),
    (
        Newline,
        45..45,
    ),
]
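The ranges in this snapshot also show how escaped braces are handled: `{{`/`}}` stay inside `FStringMiddle` as literal `{`/`}` (which is why the value " {another} " is shorter than its 14..27 range), while single braces delimit expression parts. The source string below is reconstructed from the token ranges and may differ from the actual test input in quote style:

```rust
// Hypothetical reconstruction; the slices line up with the ranges above.
fn main() {
    let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
    assert_eq!(&source[2..9], "normal ");          // FStringMiddle "normal "
    assert_eq!(&source[14..27], " {{another}} ");  // FStringMiddle " {another} "
    assert_eq!(&source[32..35], " {{");            // FStringMiddle " {"
    assert_eq!(&source[42..44], "}}");             // FStringMiddle "}"
    assert_eq!(&source[44..45], "\"");             // FStringEnd
}
```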
