diff --git a/src/data/line.rs b/src/data/line.rs index fd09b4d9..821274c0 100644 --- a/src/data/line.rs +++ b/src/data/line.rs @@ -126,7 +126,7 @@ impl fmt::Debug for LinePart { } impl LinePart { - // XXX: does not handle multi-byte encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding const _CHARSZ: usize = 1; /// create a new `LinePart`. Remember that `blocki_end` points to one byte past @@ -223,7 +223,7 @@ impl LinePart { #[cfg(any(debug_assertions,test))] pub(self) fn impl_to_String_raw(self: &LinePart, raw: bool) -> String { // XXX: intermixing byte lengths and character lengths - // XXX: does not handle multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding let s1: String; let slice_ = self.as_slice(); if raw { @@ -655,7 +655,7 @@ impl Line { /// /// `raw` false will write transcode each byte to a character and use pictoral representations /// - /// XXX: `raw==false` does not handle multi-byte encodings + // XXX: Issue #16 `raw==false` only handles UTF-8/ASCII encoding #[cfg(any(debug_assertions,test))] pub fn print(self: &Line, raw: bool) { // is this an expensive command? should `stdout` be cached? 
@@ -674,7 +674,7 @@ impl Line { } } } else { - // XXX: only handle single-byte encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding // XXX: this is not efficient let s = match std::str::from_utf8(slice) { Ok(val) => val, diff --git a/src/data/sysline.rs b/src/data/sysline.rs index 0039536f..618e7444 100644 --- a/src/data/sysline.rs +++ b/src/data/sysline.rs @@ -127,7 +127,7 @@ impl Sysline { /// default `with_capacity` for a `Lines`, most often will only need 1 capacity /// as the found "sysline" will likely be one `Line` const SYSLINE_PARTS_WITH_CAPACITY: usize = 1; - // XXX: does not handle multi-byte encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding const CHARSZ: usize = 1; pub fn new() -> Sysline { @@ -263,6 +263,7 @@ impl Sysline { /// get the last byte of this Sysline pub(crate) fn last_byte(self: &Sysline) -> Option { + // XXX: Issue #16 only handles UTF-8/ASCII encoding assert_eq!(self.charsz(), 1, "charsz {} not implemented", self.charsz()); let len_ = self.lines.len(); if len_ <= 0 { @@ -317,7 +318,7 @@ impl Sysline { sz += (*lp).len(); } // XXX: intermixing byte lengths and character lengths - // XXX: does not handle multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding let mut s_ = String::with_capacity(sz + 1); for lp in &self.lines { s_ += (*lp).impl_to_String_raw(raw).as_str(); diff --git a/src/printer_debug/printers.rs b/src/printer_debug/printers.rs index 5a0b6660..488d1323 100644 --- a/src/printer_debug/printers.rs +++ b/src/printer_debug/printers.rs @@ -443,7 +443,7 @@ pub fn pretty_print(buffer: &[u8], raw: bool) { // is this an expensive command? should `stdout` be cached? 
let stdout: std::io::Stdout = std::io::stdout(); let mut stdout_lock = stdout.lock(); - // XXX: only handle single-byte encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding // XXX: doing this char by char is probably not efficient //let s = match str::from_utf8_lossy(buffer) { let s = match core::str::from_utf8(buffer) { diff --git a/src/readers/blockreader.rs b/src/readers/blockreader.rs index fc0429cd..0cf7812f 100644 --- a/src/readers/blockreader.rs +++ b/src/readers/blockreader.rs @@ -362,7 +362,7 @@ impl BlockReader { } let path_std: &Path = Path::new(&path); - // TODO: pass in `mimeguess`; avoid repeats of the tedious operation + // TODO: Issue #15 pass in `mimeguess`; avoid repeats of the tedious operation let mimeguess_: MimeGuess = MimeGuess::from_path(path_std); let mut open_options = FileOpenOptions::new(); diff --git a/src/readers/linereader.rs b/src/readers/linereader.rs index e8f834d3..273dc746 100644 --- a/src/readers/linereader.rs +++ b/src/readers/linereader.rs @@ -106,7 +106,7 @@ pub struct LineReader { /// Distinct from `self.lines.len()` as that may have contents removed when --streaming pub (crate) lines_processed: Count, /// smallest size character in bytes - /// TODO: handle char sizes > 1 byte, multi-byte encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding charsz_: CharSz, /// enable internal LRU cache for `find_line` (default `true`) find_line_lru_cache_enabled: bool, @@ -147,7 +147,7 @@ const CHARSZ_MIN: CharSz = 1; /// maximum char storage size in bytes const CHARSZ_MAX: CharSz = 4; /// default char storage size in bytes -/// XXX: does not handle multi-byte encodings (e.g. UTF-8) or multi-byte character storage (e.g. 
UTF-32) +// XXX: Issue #16 only handles UTF-8/ASCII encoding const CHARSZ: CharSz = CHARSZ_MIN; /// implement the LineReader things @@ -160,7 +160,7 @@ impl LineReader { pub fn new(path: FPath, filetype: FileType, blocksz: BlockSz) -> Result { dpnxf!("LineReader::new({:?}, {:?}, {:?})", path, filetype, blocksz); - // XXX: multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding assert_ge!( blocksz, (CHARSZ_MIN as BlockSz), @@ -473,7 +473,7 @@ impl LineReader { match self.get_linep(&fileoffset) { Some(linep) => { dpo!("self.get_linep({}) returned @{:p}", fileoffset, linep); - // XXX: does not handle multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_next: FileOffset = (*linep).fileoffset_end() + charsz_fo; if self.is_line_last(&linep) { if self.find_line_lru_cache_enabled { @@ -622,8 +622,7 @@ impl LineReader { let bi_stop: BlockIndex = bptr_middle.len() as BlockIndex; assert_ge!(bi_stop, charsz_bi, "bi_stop is less than charsz; not yet handled"); - // XXX: multi-byte - //bi_beg = bi_stop - charsz_bi; + // XXX: Issue #16 only handles UTF-8/ASCII encoding dpof!("({}) B1: scan middle block {} forwards, starting from blockindex {} (fileoffset {}) searching for newline B", fileoffset, bo_middle, @@ -631,7 +630,7 @@ impl LineReader { self.file_offset_at_block_offset_index(bo_middle, bi_at) ); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr_middle)[bi_at] == NLu8 { found_nl_b = true; fo_nl_b = self.file_offset_at_block_offset_index(bo_middle, bi_at); @@ -867,7 +866,7 @@ impl LineReader { BI_STOP, ); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr_middle)[bi_at] == NLu8 { found_nl_a = true; fo_nl_a = self.file_offset_at_block_offset_index(bo_middle, bi_at); @@ -880,7 +879,7 @@ impl LineReader { byte_to_char_noraw((*bptr_middle)[bi_at]), ); // adjust offsets one forward - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding 
fo_nl_a1 = fo_nl_a + charsz_fo; bi_at += charsz_bi; break; @@ -888,7 +887,7 @@ impl LineReader { if bi_at == 0 { break; } - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding bi_at -= charsz_bi; if bi_at < BI_STOP { break; @@ -982,8 +981,6 @@ impl LineReader { /// fine_line(2) -> 2,2 "y" /// ``` /// - /// XXX: presumes a single-byte can represent a '\n'; i.e. does not handle UTF-16 or UTF-32 or other. - /// /// XXX: returning the "next fileoffset (along with `LineP`) is jenky. Just return the `LineP`. /// and/or add `iter` capabilities to `Line` that will hide tracking the "next fileoffset". /// @@ -992,7 +989,7 @@ impl LineReader { /// Changes require extensive retesting. /// You've been warned. /// - // TODO: [2021/08/30] handle different encodings + // XXX: Issue #16 only handles UTF-8/ASCII encoding pub fn find_line(&mut self, fileoffset: FileOffset) -> ResultS4LineFind { dpnf!("(LineReader@{:p}, {})", self, fileoffset); @@ -1091,11 +1088,11 @@ impl LineReader { let mut bi_at: BlockIndex = bi_middle; let bi_stop: BlockIndex = bptr_middle.len() as BlockIndex; assert_ge!(bi_stop, charsz_bi, "bi_stop is less than charsz; not yet handled"); - // XXX: multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding //bi_beg = bi_stop - charsz_bi; dpof!("B1: scan middle block {} forwards (block len {}), starting from blockindex {} (fileoffset {}) searching for newline B", bo_middle, (*bptr_middle).len(), bi_at, self.file_offset_at_block_offset_index(bo_middle, bi_at)); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr_middle)[bi_at] == NLu8 { found_nl_b = true; fo_nl_b = self.file_offset_at_block_offset_index(bo_middle, bi_at); @@ -1175,7 +1172,7 @@ impl LineReader { bi_end = (*bptr).len() as BlockIndex; assert_ge!(bi_end, charsz_bi, "blockindex bi_end {} is less than charsz; not yet handled, file {:?}", bi_end, self.path()); assert_ne!(bi_end, 0, "blockindex bi_end is zero; Block at 
blockoffset {}, BlockP @0x{:p}, has len() zero", bof, bptr); - // XXX: multi-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding //bi_beg = bi_end - charsz_bi; dpof!( "B2: scan block {} forwards, starting from blockindex {} (fileoffset {}) up to blockindex {} searching for newline B", @@ -1185,7 +1182,7 @@ impl LineReader { bi_end, ); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr)[bi_beg] == NLu8 { found_nl_b = true; fo_nl_b = self.file_offset_at_block_offset_index(bof, bi_beg); @@ -1413,7 +1410,7 @@ impl LineReader { bo_middle, bi_at, self.file_offset_at_block_offset_index(bo_middle, bi_at), BI_STOP, ); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr_middle)[bi_at] == NLu8 { found_nl_a = true; fo_nl_a = self.file_offset_at_block_offset_index(bo_middle, bi_at); @@ -1425,7 +1422,7 @@ impl LineReader { byte_to_char_noraw((*bptr_middle)[bi_at]), ); // adjust offsets one forward - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding fo_nl_a1 = fo_nl_a + charsz_fo; bi_at += charsz_bi; break; @@ -1433,7 +1430,7 @@ impl LineReader { if bi_at == 0 { break; } - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding bi_at -= charsz_bi; if bi_at < BI_STOP { break; @@ -1523,7 +1520,7 @@ impl LineReader { bof, bi_at, self.file_offset_at_block_offset_index(bof, bi_at), BI_STOP, ); loop { - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding if (*bptr)[bi_at] == NLu8 { found_nl_a = true; fo_nl_a = self.file_offset_at_block_offset_index(bof, bi_at); @@ -1535,7 +1532,7 @@ impl LineReader { byte_to_char_noraw((*bptr)[bi_at]), ); // adjust offsets one forward - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding fo_nl_a1 = fo_nl_a + charsz_fo; bi_at += charsz_bi; let bof_a1 = self.block_offset_at_file_offset(fo_nl_a1); @@ -1573,7 +1570,7 @@ impl 
LineReader { if bi_at == 0 { break; } - // XXX: single-byte encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding bi_at -= charsz_bi; if bi_at < BI_STOP { break; diff --git a/src/readers/syslinereader.rs b/src/readers/syslinereader.rs index 9afef006..e7ffa230 100644 --- a/src/readers/syslinereader.rs +++ b/src/readers/syslinereader.rs @@ -623,7 +623,7 @@ impl SyslineReader { dpnf!("syslines.insert({}, Sysline @[{}, {}] datetime: {:?})", fo_beg, (*syslinep).fileoffset_begin(), (*syslinep).fileoffset_end(), (*syslinep).dt()); self.syslines.insert(fo_beg, syslinep.clone()); self.syslines_count += 1; - // XXX: multi-byte character + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_end1: FileOffset = fo_end + (self.charsz() as FileOffset); dpxf!("syslines_by_range.insert(({}‥{}], {})", fo_beg, fo_end1, fo_beg); self.syslines_by_range.insert(fo_beg..fo_end1, fo_beg); @@ -734,7 +734,7 @@ impl SyslineReader { dpof!("line too short {} for requested start {}; continue", line.len(), dtpd.range_regex.start); continue; } - // XXX: does not support multi-byte string; assumes single-byte + // XXX: Issue #16 only handles UTF-8/ASCII encoding let slice_end: usize; if line.len() > dtpd.range_regex.end { slice_end = dtpd.range_regex.end; @@ -1081,7 +1081,7 @@ impl SyslineReader { self.syslines_by_range_hit += 1; let fo: &FileOffset = range_fo.1; let syslinep: SyslineP = self.syslines[fo].clone(); - // XXX: multi-byte character encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_next: FileOffset = (*syslinep).fileoffset_next() + (self.charsz() as FileOffset); if self.is_sysline_last(&syslinep) { dpxf!( @@ -1123,7 +1123,7 @@ impl SyslineReader { self.syslines_hit += 1; dpof!("hit self.syslines for FileOffset {}", fileoffset); let syslinep: SyslineP = self.syslines[&fileoffset].clone(); - // XXX: multi-byte character encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_next: FileOffset = (*syslinep).fileoffset_end() + (self.charsz() 
as FileOffset); if self.is_sysline_last(&syslinep) { dpof!( @@ -1502,7 +1502,7 @@ impl SyslineReader { debug_assert!(self.syslines_by_range.contains_key(&fo1), "self.syslines.contains_key({}) however, self.syslines_by_range.contains_key({}); syslines_by_range out of synch", fo1, fo1); dpo!("find_sysline: hit self.syslines for FileOffset {}", fo1); let syslinep = self.syslines[&fo1].clone(); - // XXX: multi-byte character encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_next = (*syslinep).fileoffset_end() + (self.charsz() as FileOffset); // TODO: determine if `fileoffset` is the last sysline of the file // should add a private helper function for this task `is_sysline_last(FileOffset)` ... something like that @@ -1535,7 +1535,7 @@ impl SyslineReader { self.syslines_by_range_hit += 1; let fo = range_fo.1; let syslinep = self.syslines[fo].clone(); - // XXX: multi-byte character encoding + // XXX: Issue #16 only handles UTF-8/ASCII encoding let fo_next = (*syslinep).fileoffset_next() + (self.charsz() as FileOffset); if self.find_sysline_lru_cache_enabled { self.find_sysline_lru_cache_put += 1;