Skip to content

Commit

Permalink
perf(es/fast-lexer): Optimize bound checks (#10157)
Browse files Browse the repository at this point in the history
**Description:**

These bound checks are unnecessary, and removing them has a noticeable performance impact on macOS.


<img width="1840" alt="image"
src="https://github.com/user-attachments/assets/12073f7c-f0bb-4e8c-81d0-9176312a15d5"
/>
  • Loading branch information
kdy1 authored Mar 6, 2025
1 parent fb610b0 commit d74360e
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 44 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ resolver = "2"
ascii = "1.1.0"
assert_cmd = "2.0.12"
assert_fs = "1.0.13"
assume = "0.5.0"
auto_impl = "1.2.0"
backtrace = "0.3.61"
base64 = "0.22.1"
Expand Down
1 change: 1 addition & 0 deletions crates/swc_ecma_fast_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ swc_atoms = { version = "5.0.0", path = "../swc_atoms" }
swc_common = { version = "8.0.0", path = "../swc_common" }
swc_ecma_ast = { version = "8.0.0", path = "../swc_ecma_ast" }

assume = { workspace = true }
num-bigint = { workspace = true }
phf = { workspace = true, features = ["macros"] }
wide = { workspace = true }
Expand Down
11 changes: 6 additions & 5 deletions crates/swc_ecma_fast_parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
//!
//! This cursor operates directly on UTF-8 bytes for maximum performance.
use assume::assume;
use swc_common::BytePos;
use wide::u8x16;

use crate::util::{likely, unlikely};
use crate::util::unlikely;

/// High-performance cursor for traversing input bytes
#[repr(C)] // Ensure predictable memory layout for better cache behavior
Expand Down Expand Up @@ -78,15 +79,15 @@ impl<'a> Cursor<'a> {
/// Advance the cursor by one byte
#[inline(always)]
pub fn advance(&mut self) {
if likely(!self.is_eof()) {
self.pos += 1;
}
assume!(unsafe: !self.is_eof());
self.pos += 1;
}

/// Advance the cursor by n bytes
#[inline(always)]
pub fn advance_n(&mut self, n: u32) {
self.pos = (self.pos + n).min(self.len);
assume!(unsafe: self.pos + n <= self.len);
self.pos += n;
}

/// Advance until the predicate returns false or EOF is reached
Expand Down
83 changes: 51 additions & 32 deletions crates/swc_ecma_fast_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,8 @@ impl Lexer<'_> {
match len {
// Direct lookup for 2-letter keywords
2 => {
let word_bytes = [ident_bytes[0], ident_bytes[1]];
let word_bytes =
unsafe { [*ident_bytes.get_unchecked(0), *ident_bytes.get_unchecked(1)] };
let word_value = u16::from_be_bytes(word_bytes);

for &(keyword_value, token_type) in &KEYWORDS_LEN2 {
Expand All @@ -348,7 +349,14 @@ impl Lexer<'_> {

// Direct lookup for 3-letter keywords
3 => {
let word_bytes = [ident_bytes[0], ident_bytes[1], ident_bytes[2], 0];
let word_bytes = unsafe {
[
*ident_bytes.get_unchecked(0),
*ident_bytes.get_unchecked(1),
*ident_bytes.get_unchecked(2),
0,
]
};
let word_value = (u32::from_be_bytes(word_bytes)) >> 8;

for &(keyword_value, token_type) in &KEYWORDS_LEN3 {
Expand All @@ -365,12 +373,14 @@ impl Lexer<'_> {

// Direct lookup for 4-letter keywords
4 => {
let word_bytes = [
ident_bytes[0],
ident_bytes[1],
ident_bytes[2],
ident_bytes[3],
];
let word_bytes = unsafe {
[
*ident_bytes.get_unchecked(0),
*ident_bytes.get_unchecked(1),
*ident_bytes.get_unchecked(2),
*ident_bytes.get_unchecked(3),
]
};
let word_value = u32::from_be_bytes(word_bytes);

for &(keyword_value, token_type) in &KEYWORDS_LEN4 {
Expand All @@ -387,16 +397,18 @@ impl Lexer<'_> {

// Direct lookup for 5-letter keywords
5 => {
let word_bytes = [
ident_bytes[0],
ident_bytes[1],
ident_bytes[2],
ident_bytes[3],
ident_bytes[4],
0,
0,
0,
];
let word_bytes = unsafe {
[
*ident_bytes.get_unchecked(0),
*ident_bytes.get_unchecked(1),
*ident_bytes.get_unchecked(2),
*ident_bytes.get_unchecked(3),
*ident_bytes.get_unchecked(4),
0,
0,
0,
]
};
let word_value = (u64::from_be_bytes(word_bytes)) >> 24;

for &(keyword_value, token_type) in &KEYWORDS_LEN5 {
Expand All @@ -413,16 +425,18 @@ impl Lexer<'_> {

// Direct lookup for 6-letter keywords
6 => {
let word_bytes = [
ident_bytes[0],
ident_bytes[1],
ident_bytes[2],
ident_bytes[3],
ident_bytes[4],
ident_bytes[5],
0,
0,
];
let word_bytes = unsafe {
[
*ident_bytes.get_unchecked(0),
*ident_bytes.get_unchecked(1),
*ident_bytes.get_unchecked(2),
*ident_bytes.get_unchecked(3),
*ident_bytes.get_unchecked(4),
*ident_bytes.get_unchecked(5),
0,
0,
]
};
let word_value = (u64::from_be_bytes(word_bytes)) >> 16;

for &(keyword_value, token_type) in &KEYWORDS_LEN6 {
Expand All @@ -440,11 +454,16 @@ impl Lexer<'_> {
// Fast path for longer keywords using the lookup table
7..=16 => {
// Get index in KEYWORD_LOOKUP using our index table
let lookup_idx = KEYWORD_INDEX[len - 1][(ident_bytes[0] - b'a') as usize];
let first_char_idx = unsafe { (*ident_bytes.get_unchecked(0) - b'a') as usize };
let lookup_idx = unsafe {
*KEYWORD_INDEX
.get_unchecked(len - 1)
.get_unchecked(first_char_idx)
};

if lookup_idx != 255 {
// Check if the word matches the entry
let entry = &KEYWORD_LOOKUP[lookup_idx as usize];
let entry = unsafe { KEYWORD_LOOKUP.get_unchecked(lookup_idx as usize) };
if entry.keyword == ident_str {
if let Some(token_type) = entry.token_type {
return Ok(Token::new(
Expand Down Expand Up @@ -485,12 +504,12 @@ impl Lexer<'_> {
/// Super fast check for ASCII identifier start character
#[inline(always)]
pub(crate) fn is_ascii_id_start(ch: u8) -> bool {
ch < 128 && (IDENT_CHAR[ch as usize] & 1) != 0
ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 1) != 0 }
}

/// Super fast check for ASCII identifier continue character
#[inline(always)]
pub(crate) fn is_ascii_id_continue(ch: u8) -> bool {
ch < 128 && (IDENT_CHAR[ch as usize] & 2) != 0
ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 2) != 0 }
}
}
14 changes: 7 additions & 7 deletions crates/swc_ecma_fast_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ impl<'a> Lexer<'a> {

// Fast path for ASCII tokens using lookup table
if likely(ch < 128) {
let char_type = ASCII_LOOKUP[ch as usize];
let char_type = unsafe { *ASCII_LOOKUP.get_unchecked(ch as usize) };

// Fast path for single-character tokens (very common)
if char_type & CHAR_SPECIAL != 0 {
Expand Down Expand Up @@ -420,7 +420,7 @@ impl<'a> Lexer<'a> {

// Handle ASCII characters
if likely(ch < 128) {
let char_type = ASCII_LOOKUP[ch as usize];
let char_type = unsafe { *ASCII_LOOKUP.get_unchecked(ch as usize) };

// Fast path for common whitespace
if char_type & CHAR_WHITESPACE != 0 {
Expand Down Expand Up @@ -518,7 +518,7 @@ impl<'a> Lexer<'a> {
// Get current 16 bytes
let input = self.cursor.rest();
let mut data = [0u8; 16];
data.copy_from_slice(&input[0..16]);
data.copy_from_slice(unsafe { input.get_unchecked(0..16) });
let chunk = u8x16::new(data);

// Compare with our whitespace vectors
Expand All @@ -543,20 +543,20 @@ impl<'a> Lexer<'a> {
// This is more efficient than trying to process the entire chunk at once
// when we need to handle special cases like CR+LF and comments

if is_basic_ws_arr[0] != 0 {
if unsafe { *is_basic_ws_arr.get_unchecked(0) } != 0 {
// Regular whitespace - just advance
self.cursor.advance();
return true;
}

if is_newline_arr[0] != 0 {
if unsafe { *is_newline_arr.get_unchecked(0) } != 0 {
// Newline - need to set had_line_break
self.cursor.advance();
self.had_line_break = LineBreak::Present;
return true;
}

if is_cr_arr[0] != 0 {
if unsafe { *is_cr_arr.get_unchecked(0) } != 0 {
// Carriage return - need to check for CRLF sequence
self.cursor.advance();
if let Some(b'\n') = self.cursor.peek() {
Expand All @@ -566,7 +566,7 @@ impl<'a> Lexer<'a> {
return true;
}

if is_slash_arr[0] != 0 {
if unsafe { *is_slash_arr.get_unchecked(0) } != 0 {
// Potential comment - need to check next character
if let Some(b'/') | Some(b'*') = self.cursor.peek_at(1) {
return false; // Let the caller handle comments
Expand Down

0 comments on commit d74360e

Please sign in to comment.