diff --git a/Cargo.lock b/Cargo.lock index 26064c8afcbe..72e508955389 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -214,6 +214,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "assume" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d6f9ca11400f14ef046700eb6401c706c587871303453a5e7586efb82340c3d" + [[package]] name = "ast_node" version = "3.0.0" @@ -5303,6 +5309,7 @@ dependencies = [ name = "swc_ecma_fast_parser" version = "1.0.0" dependencies = [ + "assume", "codspeed-criterion-compat", "criterion", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index 12512f85fae7..8b93cac7872a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ resolver = "2" ascii = "1.1.0" assert_cmd = "2.0.12" assert_fs = "1.0.13" + assume = "0.5.0" auto_impl = "1.2.0" backtrace = "0.3.61" base64 = "0.22.1" diff --git a/crates/swc_ecma_fast_parser/Cargo.toml b/crates/swc_ecma_fast_parser/Cargo.toml index 4b1afb3c69d9..853199eb7235 100644 --- a/crates/swc_ecma_fast_parser/Cargo.toml +++ b/crates/swc_ecma_fast_parser/Cargo.toml @@ -18,6 +18,7 @@ swc_atoms = { version = "5.0.0", path = "../swc_atoms" } swc_common = { version = "8.0.0", path = "../swc_common" } swc_ecma_ast = { version = "8.0.0", path = "../swc_ecma_ast" } +assume = { workspace = true } num-bigint = { workspace = true } phf = { workspace = true, features = ["macros"] } wide = { workspace = true } diff --git a/crates/swc_ecma_fast_parser/src/lexer/cursor.rs b/crates/swc_ecma_fast_parser/src/lexer/cursor.rs index bad78e78a0b0..e141dbfa0ada 100644 --- a/crates/swc_ecma_fast_parser/src/lexer/cursor.rs +++ b/crates/swc_ecma_fast_parser/src/lexer/cursor.rs @@ -2,10 +2,11 @@ //! //! This cursor operates directly on UTF-8 bytes for maximum performance. +use assume::assume; use swc_common::BytePos; use wide::u8x16; -use crate::util::{likely, unlikely}; +use crate::util::unlikely; /// High-performance cursor for traversing input bytes #[repr(C)] // Ensure predictable memory layout for better cache behavior @@ -78,15 +79,15 @@ impl<'a> Cursor<'a> { /// Advance the cursor by one byte #[inline(always)] pub fn advance(&mut self) { - if likely(!self.is_eof()) { - self.pos += 1; - } + assume!(unsafe: !self.is_eof()); + self.pos += 1; } /// Advance the cursor by n bytes #[inline(always)] pub fn advance_n(&mut self, n: u32) { - self.pos = (self.pos + n).min(self.len); + assume!(unsafe: self.pos + n <= self.len); + self.pos += n; } /// Advance until the predicate returns false or EOF is reached diff --git a/crates/swc_ecma_fast_parser/src/lexer/identifier.rs b/crates/swc_ecma_fast_parser/src/lexer/identifier.rs index f72eb6b22e3e..d542317ac2c6 100644 --- a/crates/swc_ecma_fast_parser/src/lexer/identifier.rs +++ b/crates/swc_ecma_fast_parser/src/lexer/identifier.rs @@ -331,7 +331,8 @@ impl Lexer<'_> { match len { // Direct lookup for 2-letter keywords 2 => { - let word_bytes = [ident_bytes[0], ident_bytes[1]]; + let word_bytes = + unsafe { [*ident_bytes.get_unchecked(0), *ident_bytes.get_unchecked(1)] }; let word_value = u16::from_be_bytes(word_bytes); for &(keyword_value, token_type) in &KEYWORDS_LEN2 { @@ -348,7 +349,14 @@ impl Lexer<'_> { // Direct lookup for 3-letter keywords 3 => { - let word_bytes = [ident_bytes[0], ident_bytes[1], ident_bytes[2], 0]; + let word_bytes = unsafe { + [ + *ident_bytes.get_unchecked(0), + *ident_bytes.get_unchecked(1), + *ident_bytes.get_unchecked(2), + 0, + ] + }; let word_value = (u32::from_be_bytes(word_bytes)) >> 8; for &(keyword_value, token_type) in &KEYWORDS_LEN3 { @@ -365,12 +373,14 @@ impl Lexer<'_> { // Direct lookup for 4-letter keywords 4 => { - let word_bytes = [ - ident_bytes[0], - ident_bytes[1], - ident_bytes[2], - ident_bytes[3], - ]; + let word_bytes = unsafe { + [ + *ident_bytes.get_unchecked(0), + *ident_bytes.get_unchecked(1), + *ident_bytes.get_unchecked(2), + *ident_bytes.get_unchecked(3), + ] + }; let word_value = u32::from_be_bytes(word_bytes); for &(keyword_value, token_type) in &KEYWORDS_LEN4 { @@ -387,16 +397,18 @@ impl Lexer<'_> { // Direct lookup for 5-letter keywords 5 => { - let word_bytes = [ - ident_bytes[0], - ident_bytes[1], - ident_bytes[2], - ident_bytes[3], - ident_bytes[4], - 0, - 0, - 0, - ]; + let word_bytes = unsafe { + [ + *ident_bytes.get_unchecked(0), + *ident_bytes.get_unchecked(1), + *ident_bytes.get_unchecked(2), + *ident_bytes.get_unchecked(3), + *ident_bytes.get_unchecked(4), + 0, + 0, + 0, + ] + }; let word_value = (u64::from_be_bytes(word_bytes)) >> 24; for &(keyword_value, token_type) in &KEYWORDS_LEN5 { @@ -413,16 +425,18 @@ impl Lexer<'_> { // Direct lookup for 6-letter keywords 6 => { - let word_bytes = [ - ident_bytes[0], - ident_bytes[1], - ident_bytes[2], - ident_bytes[3], - ident_bytes[4], - ident_bytes[5], - 0, - 0, - ]; + let word_bytes = unsafe { + [ + *ident_bytes.get_unchecked(0), + *ident_bytes.get_unchecked(1), + *ident_bytes.get_unchecked(2), + *ident_bytes.get_unchecked(3), + *ident_bytes.get_unchecked(4), + *ident_bytes.get_unchecked(5), + 0, + 0, + ] + }; let word_value = (u64::from_be_bytes(word_bytes)) >> 16; for &(keyword_value, token_type) in &KEYWORDS_LEN6 { @@ -440,11 +454,16 @@ impl Lexer<'_> { // Fast path for longer keywords using the lookup table 7..=16 => { // Get index in KEYWORD_LOOKUP using our index table - let lookup_idx = KEYWORD_INDEX[len - 1][(ident_bytes[0] - b'a') as usize]; + let first_char_idx = unsafe { (*ident_bytes.get_unchecked(0) - b'a') as usize }; + let lookup_idx = unsafe { + *KEYWORD_INDEX + .get_unchecked(len - 1) + .get_unchecked(first_char_idx) + }; if lookup_idx != 255 { // Check if the word matches the entry - let entry = &KEYWORD_LOOKUP[lookup_idx as usize]; + let entry = unsafe { KEYWORD_LOOKUP.get_unchecked(lookup_idx as usize) }; if entry.keyword == ident_str { if let Some(token_type) = entry.token_type { return Ok(Token::new( @@ -485,12 +504,12 @@ impl Lexer<'_> { /// Super fast check for ASCII identifier start character #[inline(always)] pub(crate) fn is_ascii_id_start(ch: u8) -> bool { - ch < 128 && (IDENT_CHAR[ch as usize] & 1) != 0 + ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 1) != 0 } } /// Super fast check for ASCII identifier continue character #[inline(always)] pub(crate) fn is_ascii_id_continue(ch: u8) -> bool { - ch < 128 && (IDENT_CHAR[ch as usize] & 2) != 0 + ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 2) != 0 } } } diff --git a/crates/swc_ecma_fast_parser/src/lexer/mod.rs b/crates/swc_ecma_fast_parser/src/lexer/mod.rs index 06acf5369bac..892d081bea30 100644 --- a/crates/swc_ecma_fast_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_fast_parser/src/lexer/mod.rs @@ -274,7 +274,7 @@ impl<'a> Lexer<'a> { // Fast path for ASCII tokens using lookup table if likely(ch < 128) { - let char_type = ASCII_LOOKUP[ch as usize]; + let char_type = unsafe { *ASCII_LOOKUP.get_unchecked(ch as usize) }; // Fast path for single-character tokens (very common) if char_type & CHAR_SPECIAL != 0 { @@ -420,7 +420,7 @@ impl<'a> Lexer<'a> { // Handle ASCII characters if likely(ch < 128) { - let char_type = ASCII_LOOKUP[ch as usize]; + let char_type = unsafe { *ASCII_LOOKUP.get_unchecked(ch as usize) }; // Fast path for common whitespace if char_type & CHAR_WHITESPACE != 0 { @@ -518,7 +518,7 @@ impl<'a> Lexer<'a> { // Get current 16 bytes let input = self.cursor.rest(); let mut data = [0u8; 16]; - data.copy_from_slice(&input[0..16]); + data.copy_from_slice(unsafe { input.get_unchecked(0..16) }); let chunk = u8x16::new(data); // Compare with our whitespace vectors @@ -543,20 +543,20 @@ impl<'a> Lexer<'a> { // This is more efficient than trying to process the entire chunk at once // when we need to handle special cases like CR+LF and comments - if is_basic_ws_arr[0] != 0 { + if unsafe { *is_basic_ws_arr.get_unchecked(0) } != 0 { // Regular whitespace - just advance self.cursor.advance(); return true; } - if is_newline_arr[0] != 0 { + if unsafe { *is_newline_arr.get_unchecked(0) } != 0 { // Newline - need to set had_line_break self.cursor.advance(); self.had_line_break = LineBreak::Present; return true; } - if is_cr_arr[0] != 0 { + if unsafe { *is_cr_arr.get_unchecked(0) } != 0 { // Carriage return - need to check for CRLF sequence self.cursor.advance(); if let Some(b'\n') = self.cursor.peek() { @@ -566,7 +566,7 @@ impl<'a> Lexer<'a> { return true; } - if is_slash_arr[0] != 0 { + if unsafe { *is_slash_arr.get_unchecked(0) } != 0 { // Potential comment - need to check next character if let Some(b'/') | Some(b'*') = self.cursor.peek_at(1) { return false; // Let the caller handle comments