WIP Regex matches bytes

Regex matching on bytes instead of str. Postpones `u8_to_str` conversions (and potential problems) to last possible moment. WIP Priliminary implementation with much inefficieny. Fixes errors in parsing lines with unicode chars that spanned Block boundaries.
jtmoon79 · Jul 12, 2022 · 3d78b0d · 3d78b0d
1 parent a932d5a
commit 3d78b0d
Show file tree

Hide file tree

Showing 4 changed files with 277 additions and 39 deletions.
diff --git a/src/Data/datetime.rs b/src/Data/datetime.rs
@@ -1832,6 +1832,148 @@ fn captures_to_buffer(
     debug_eprintln!("{}captures_to_buffer buffer {:?}", snx(), buffer);
 }
 
+/// helper to `captures_to_buffer_bytes`
+macro_rules! copy_capturegroup_to_buffer {
+    (
+        $name:ident,
+        $captures:ident,
+        $buffer:ident,
+        $at:ident
+    ) => {
+        let len_: usize = $captures.name($name).as_ref().unwrap().as_bytes().len();
+        debug_eprintln!("{}bytes_to_regex_to_datetime:copy_capturegroup_to_buffer! buffer[{:?}..{:?}]", so(), $at, $at+len_);
+        $buffer[$at..$at+len_].copy_from_slice($captures.name($name).as_ref().unwrap().as_bytes());
+        $at += len_;
+    }
+}
+
+/// helper to `captures_to_buffer_bytes`
+macro_rules! copy_slice_to_buffer {
+    (
+        $u8_slice:expr,
+        $buffer:ident,
+        $at:ident
+    ) => {
+        let len_: usize = $u8_slice.len();
+        debug_eprintln!("{}bytes_to_regex_to_datetime:copy_slice_to_buffer! buffer[{:?}..{:?}]", so(), $at, $at+len_);
+        $buffer[$at..$at+len_].copy_from_slice($u8_slice);
+        $at += len_;
+    }
+}
+
+
+/// helper to `captures_to_buffer_bytes`
+macro_rules! copy_u8_to_buffer {
+    (
+        $u8_:expr,
+        $buffer:ident,
+        $at:ident
+    ) => {
+        debug_eprintln!("{}bytes_to_regex_to_datetime:copy_slice_to_buffer! buffer[{:?}] = {:?}", so(), $at, $u8_);
+        $buffer[$at] = $u8_;
+        $at += 1;
+    }
+}
+
+/// Put `Captures` into a `String` buffer in a particular order and formatting. This bridges the
+/// `DateTime_Parse_Data::regex_pattern` to `DateTime_Parse_Data::dt_pattern`.
+///
+/// Directly relates to datetime format `dt_pattern` values in `DATETIME_PARSE_DATAS`
+/// which use `DTFSS_YmdHMS`, etc.
+#[inline(always)]
+fn captures_to_buffer_bytes(
+    buffer: &mut[u8],
+    captures: &regex::bytes::Captures,
+    tz_offset: &FixedOffset,
+    dtfs: &DTFSSet,
+) -> usize {
+    debug_eprintln!("{}captures_to_buffer_bytes", sn());
+    let mut at: usize = 0;
+    // year
+    match captures.name(CGN_YEAR).as_ref() {
+        Some(match_) => {
+            copy_slice_to_buffer!(match_.as_bytes(), buffer, at);
+        }
+        None => {
+            // TODO: 2022/06/27 do something smarter than setting current year
+            // TODO: 2022/07/11 cost-savings: inefficient to create datetime and string each call; create the year str once elsewhere, pass in to this fn as `Option<String>`
+            let localb = Local::today().year().to_string();
+            copy_slice_to_buffer!(localb.as_bytes(), buffer, at);
+        }
+    }
+    // month
+    copy_capturegroup_to_buffer!(CGN_MONTH, captures, buffer, at);
+    // day
+    match dtfs.day {
+        DTFS_Day::d | DTFS_Day::e => {
+            copy_capturegroup_to_buffer!(CGN_DAY, captures, buffer, at);
+        },
+        DTFS_Day::_de_to_d => {
+            let day: &[u8] = captures.name(CGN_DAY).as_ref().unwrap().as_bytes();
+            debug_assert_eq!(day.len(), 2, "bad named group 'day' data {:?}, expected data of len 2", day);
+            const SPACE_: u8 = ' ' as u8;
+            match day[0] {
+                // change day " 8" to "08"
+                SPACE_ => {
+                    copy_u8_to_buffer!('0' as u8, buffer, at);
+                    copy_u8_to_buffer!(day[1], buffer, at);
+                }
+                _ => {
+                    copy_slice_to_buffer!(day, buffer, at);
+                }
+            }
+        }
+    }
+    copy_u8_to_buffer!('T' as u8, buffer, at);
+    // hour
+    copy_capturegroup_to_buffer!(CGN_HOUR, captures, buffer, at);
+    // minute
+    copy_capturegroup_to_buffer!(CGN_MINUTE, captures, buffer, at);
+    // second
+    copy_capturegroup_to_buffer!(CGN_SECOND, captures, buffer, at);
+    // fractional
+    match dtfs.fractional {
+        DTFS_Fractional::f => {
+            copy_u8_to_buffer!('.' as u8, buffer, at);
+            copy_capturegroup_to_buffer!(CGN_FRACTIONAL, captures, buffer, at);
+        }
+        DTFS_Fractional::_none => {}
+    }
+    // tz
+    match dtfs.tz {
+        DTFS_Tz::_fill => {
+            // TODO: cost-savings: pass pre-created TZ `&str`
+            let tzs: String = tz_offset.to_string();
+            copy_slice_to_buffer!(tzs.as_bytes(), buffer, at);
+        }
+        DTFS_Tz::z | DTFS_Tz::cz | DTFS_Tz::pz => {
+            copy_capturegroup_to_buffer!(CGN_TZ, captures, buffer, at);
+        }
+        DTFS_Tz::Z => {
+            #[allow(non_snake_case)]
+            let tzZ: &str = u8_to_str(captures.name(CGN_TZ).as_ref().unwrap().as_bytes()).unwrap();
+            match MAP_TZZ_TO_TZz.get_key_value(tzZ) {
+                Some((_tz_abbr, tz_offset_)) => {
+                    // TODO: cost-savings: pre-create the `tz_offset` entries as bytes
+                    let tzs: String = tz_offset_.to_string();
+                    copy_slice_to_buffer!(tzs.as_bytes(), buffer, at);
+                }
+                None => {
+                    // cannot find entry in MAP_TZZ_TO_TZz, fill with passed TZ
+                    // TODO: cost-savings: pre-create the `tz_offset` entries as bytes
+                    let tzs: String = tz_offset.to_string();
+                    copy_slice_to_buffer!(tzs.as_bytes(), buffer, at);
+                }
+            }
+
+        }
+    }
+
+    debug_eprintln!("{}captures_to_buffer_bytes return {:?}", sx(), at);
+
+    at
+}
+
 /// run `regex::Captures` on the `data` then convert to a chrono
 /// `Option<DateTime<FixedOffset>>` instance. Uses matching and pattern information
 /// hardcoded in `DATETIME_PARSE_DATAS_REGEX` and `DATETIME_PARSE_DATAS`.
@@ -1927,6 +2069,107 @@ pub fn str_to_regex_to_datetime(
     Some((dt_beg, dt_end, dt))
 }
 
+/// run `regex::Captures` on the `data` then convert to a chrono
+/// `Option<DateTime<FixedOffset>>` instance. Uses matching and pattern information
+/// hardcoded in `DATETIME_PARSE_DATAS_REGEX` and `DATETIME_PARSE_DATAS`.
+pub fn bytes_to_regex_to_datetime(
+    data: &[u8],
+    index: &DateTime_Parse_Datas_Index,
+    tz_offset: &FixedOffset,
+) -> Option<CapturedDtData> {
+    debug_eprintln!("{}bytes_to_regex_to_datetime({:?}, {:?}, {:?})", sn(), data, index, tz_offset);
+
+    let regex_: &Regex = match DATETIME_PARSE_DATAS_REGEX_VEC.get(*index) {
+        Some(val) => val,
+        None => {
+            panic!("requested DATETIME_PARSE_DATAS_REGEX_VEC.get({}), returned None. DATETIME_PARSE_DATAS_REGEX_VEC.len() {}", index, DATETIME_PARSE_DATAS_REGEX_VEC.len());
+        }
+    };
+
+    // shadow `regex_` with bytes-based regex
+    // TODO: cost-savings: precreate the `regex::bytes` instance
+    let regex_: regex::bytes::Regex = regex::bytes::Regex::new(regex_.as_str()).unwrap();
+    let captures: regex::bytes::Captures = match regex_.captures(data) {
+        None => {
+            debug_eprintln!("{}bytes_to_regex_to_datetime: regex: no captures (returned None)", sx());
+            return None;
+        }
+        Some(captures) => {
+            debug_eprintln!("{}bytes_to_regex_to_datetime: regex: captures.len() {}", so(), captures.len());
+
+            captures
+        }
+    };
+    if cfg!(debug_assertions) {
+        for (i, name_opt) in regex_.capture_names().enumerate() {
+            let match_: regex::bytes::Match = match captures.get(i) {
+                Some(m_) => m_,
+                None => {
+                    match name_opt {
+                        Some(name) => {
+                            eprintln!("{}bytes_to_regex_to_datetime: regex captures: {:2} {:<20} None", so(), i, name);
+                        },
+                        None => {
+                            eprintln!("{}bytes_to_regex_to_datetime: regex captures: {:2} {:<20} None", so(), i, "None");
+                        }
+                    }
+                    continue;
+                }
+            };
+            match name_opt {
+                Some(name) => {
+                    eprintln!("{}bytes_to_regex_to_datetime: regex captures: {:2} {:<20} {:?}", so(), i, name, match_.as_bytes());
+                },
+                None => {
+                    eprintln!("{}bytes_to_regex_to_datetime: regex captures: {:2} {:<20} {:?}", so(), i, "NO NAME", match_.as_bytes());
+                }
+            }
+
+        }
+    }
+    // sanity check
+    debug_assert!(!captures.iter().any(|x| x.is_none()), "a match in the regex::Captures was None");
+
+    let dtpd: &DateTime_Parse_Data = &DATETIME_PARSE_DATAS[*index];
+    // copy regex matches into a buffer with predictable ordering
+    // this ordering relates to datetime format strings in `DATETIME_PARSE_DATAS`
+    // TODO: [2022/06/26] cost-savings: avoid a `String` alloc by passing precreated buffer
+    const BUFLEN: usize = 35;
+    let mut buffer: [u8; BUFLEN] = [0; BUFLEN];
+    let copiedn = captures_to_buffer_bytes(&mut buffer, &captures, tz_offset, &dtpd.dtfs);
+
+    // use the `dt_format` to parse the buffer of regex matches
+    let buffer_s: &str = u8_to_str(&buffer[0..copiedn]).unwrap();
+    let dt = match datetime_parse_from_str(
+        buffer_s,
+        dtpd.dtfs.pattern,
+        dtpd.dtfs.has_year(),
+        dtpd.dtfs.has_tz(),
+        tz_offset,
+    ) {
+        Some(dt_) => dt_,
+        None => {
+            debug_eprintln!("{}bytes_to_regex_to_datetime return None", sx());
+            return None;
+        }
+    };
+
+    // derive the `LineIndex` bounds of the datetime substring within `data`
+    // TODO: cost-savings: only track dt_first dt_last if using `--color`
+    let dt_beg: LineIndex = match captures.name(dtpd.cgn_first) {
+        Some(match_) => match_.start() as LineIndex,
+        None => 0,
+    };
+    let dt_end: LineIndex = match captures.name(dtpd.cgn_last) {
+        Some(match_) => match_.end() as LineIndex,
+        None => 0,
+    };
+    debug_assert_lt!(dt_beg, dt_end, "bad dt_beg {} dt_end {}, index {}", dt_beg, dt_end, index);
+
+    debug_eprintln!("{}bytes_to_regex_to_datetime: return Some({:?}, {:?}, {:?})", sx(), dt_beg, dt_end, dt);
+    Some((dt_beg, dt_end, dt))
+}
+
 // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 // DateTime comparisons
 // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

diff --git a/src/Data/line.rs b/src/Data/line.rs
@@ -90,9 +90,9 @@ pub struct LinePart {
     /// used as-is in slice notation
     pub blocki_end: BlockIndex,
     /// the byte offset into the file where this `LinePart` begins
-    pub fileoffset: FileOffset,
+    fileoffset: FileOffset,
     /// blockoffset: debug helper, might be good to get rid of this?
-    pub blockoffset: BlockOffset,
+    blockoffset: BlockOffset,
     /// the file-designated BlockSz, _not_ necessarily the `len()` of the `Block` at `blockp`
     ///
     /// TODO: is this used?
@@ -170,16 +170,28 @@ impl LinePart {
         }
     }
 
-    /// length of line starting at index `blocki_beg`
+    /// length of line starting at index `blocki_beg` in bytes
     pub fn len(&self) -> usize {
         (self.blocki_end - self.blocki_beg) as usize
     }
 
-    /// since there is `fn len()`, function `is_empty` was recommended by clippy
+    /// clippy recommends `fn is_empty` since there is already `len()`
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
+    pub fn fileoffset_begin(&self) -> FileOffset {
+        self.fileoffset
+    }
+
+    pub fn fileoffset_end(&self) -> FileOffset {
+        self.fileoffset + (self.blocki_end as FileOffset)
+    }
+
+    pub fn blockoffset(&self) -> BlockOffset {
+        self.blockoffset
+    }
+
     /// count of bytes of this `LinePart`
     /// XXX: `count_bytes` and `len` is overlapping and confusing.
     ///
@@ -589,13 +601,13 @@ impl Line {
                 a_found = true;
                 b_search = true;
                 if b < len_ {
-                    debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_ab({}, {}))", so(), a, b);
+                    debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_ab({}, {})) @Block[{:?}..{:?}] @[{:?}..{:?}]", so(), a, b, linepart.blocki_beg, linepart.blocki_end, linepart.fileoffset_begin(), linepart.fileoffset_end());
                     ptrs.push(linepart.block_boxptr_ab(&a, &b));  // store [a..b]  (entire slice, entire `Line`)
                     debug_assert_gt!(ptrs.len(), 1, "ptrs is {} elements, expected >= 1; this should have been handled earlier", ptrs.len());
                     debug_eprintln!("{}get_boxptrs: return MultiPtr {} ptrs", sx(), ptrs.len());
                     return LinePartPtrs::MultiPtr(ptrs);
                 }
-                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_a({}))", so(), a);
+                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_a({})) @Block[{:?}..{:?}] @[{:?}..{:?}]", so(), a, linepart.blocki_beg, linepart.blocki_end, linepart.fileoffset_begin(), linepart.fileoffset_end());
                 ptrs.push(linepart.block_boxptr_a(&a));  // store [a..]  (first slice of `Line`)
                 b -= len_;
                 continue;
@@ -605,32 +617,18 @@ impl Line {
                 continue;
             }
             if b_search && b < len_ {
-                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_b({}))", so(), b);
+                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr_b({})) @Block[{:?}..{:?}] @[{:?}..{:?}]", so(), b, linepart.blocki_beg, linepart.blocki_end, linepart.fileoffset_begin(), linepart.fileoffset_end());
                 ptrs.push(linepart.block_boxptr_b(&b));  // store [..b] (last slice of `Line`)
                 break;
             } else  {
-                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr())", so());
+                debug_eprintln!("{}get_boxptrs: ptrs.push(linepart.block_boxptr()) @Block[{:?}..{:?}] @[{:?}..{:?}]", so(), linepart.blocki_beg, linepart.blocki_end, linepart.fileoffset_begin(), linepart.fileoffset_end());
                 ptrs.push(linepart.block_boxptr());  // store [..] (entire slice, middle part of `Line`)
                 b -= len_;
             }
         }
-        // TODO: get rid of this
-        match ptrs.len() {
-            1 => {
-                debug_eprintln!("{}get_boxptrs: return SinglePtr (TODO: no need to alloc Vec)", sx());
+        debug_assert_gt!(ptrs.len(), 1, "Ptrs is length {}, expected >1; parsing algorithm missed this case", ptrs.len());
 
-                LinePartPtrs::SinglePtr(ptrs.pop().unwrap())
-            }
-            0 => {
-                // `a, b` that are past the end of the `Line` return `NoPtr`
-                LinePartPtrs::NoPtr
-            }
-            _ => {
-                debug_eprintln!("{}get_boxptrs: return MultiPtr {} ptrs", sx(), ptrs.len());
-
-                LinePartPtrs::MultiPtr(ptrs)
-            }
-        }
+        LinePartPtrs::MultiPtr(ptrs)
     }
 
     /// `raw` true will write directly to stdout from the stored `Block`
@@ -715,10 +713,10 @@ impl Line {
             let s3 = match std::str::from_utf8(s2) {
                 Ok(val) => val,
                 Err(err) => {
-                    let fo1 = self.fileoffset_begin() + (linepart.blocki_beg as FileOffset);
-                    let fo2 = self.fileoffset_begin() + (linepart.blocki_end as FileOffset);
-                    eprintln!("ERROR: failed to convert [u8] at FileOffset[{}‥{}] to utf8 str; {}", fo1, fo2, err);
-                    continue;
+                    eprintln!("ERROR: failed to convert [u8] at LinePart@FileOffset[{}‥{}] to utf8 str, using from_utf8_unchecked; {}", linepart.fileoffset_begin(), linepart.fileoffset_end(), err);
+                    unsafe {
+                        std::str::from_utf8_unchecked(s2)
+                    }
                 }
             };
             s1.push_str(s3);

diff --git a/src/Readers/linereader.rs b/src/Readers/linereader.rs
@@ -394,7 +394,7 @@ impl LineReader {
                 debug_eprintln!("{}linereader.drop_line: Arc::try_unwrap(linep) processing Line @[{}‥{}] Block @[{}‥{}]", sn(), line.fileoffset_begin(), line.fileoffset_end(), line.blockoffset_first(), line.blockoffset_last());
                 self.drop_line_ok += 1;
                 for linepart in line.lineparts.into_iter() {
-                    self.blockreader.drop_block(linepart.blockoffset, bo_dropped);
+                    self.blockreader.drop_block(linepart.blockoffset(), bo_dropped);
                 }
             }
             Err(_linep) => {