From a5e2ebaac545df3b0b102ffa474306b6077c72a3 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Mon, 1 May 2023 18:41:57 +0100 Subject: [PATCH] Move gzip header parsing out of bufread module Header parsing is used by both bufread and write modules. --- src/gz/bufread.rs | 227 +--------------------------------------------- src/gz/mod.rs | 224 ++++++++++++++++++++++++++++++++++++++++++++- src/gz/write.rs | 3 +- 3 files changed, 228 insertions(+), 226 deletions(-) diff --git a/src/gz/bufread.rs b/src/gz/bufread.rs index c6ac5a98..5de1e261 100644 --- a/src/gz/bufread.rs +++ b/src/gz/bufread.rs @@ -3,9 +3,12 @@ use std::io; use std::io::prelude::*; use std::mem; +use super::corrupt; +use super::read_gz_header_part; +use super::Buffer; +use super::GzHeaderPartial; use super::{GzBuilder, GzHeader}; -use super::{FCOMMENT, FEXTRA, FHCRC, FNAME}; -use crate::crc::{Crc, CrcReader}; +use crate::crc::CrcReader; use crate::deflate; use crate::Compression; @@ -18,112 +21,6 @@ fn copy(into: &mut [u8], from: &[u8], pos: &mut usize) -> usize { min } -pub(crate) fn corrupt() -> io::Error { - io::Error::new( - io::ErrorKind::InvalidInput, - "corrupt gzip stream does not have a matching checksum", - ) -} - -fn bad_header() -> io::Error { - io::Error::new(io::ErrorKind::InvalidInput, "invalid gzip header") -} - -fn read_le_u16(r: &mut Buffer) -> io::Result { - let mut b = [0; 2]; - r.read_and_forget(&mut b)?; - Ok((b[0] as u16) | ((b[1] as u16) << 8)) -} - -fn read_gz_header_part<'a, R: Read>(r: &'a mut Buffer<'a, R>) -> io::Result<()> { - loop { - match r.part.state { - GzHeaderParsingState::Start => { - let mut header = [0; 10]; - r.read_and_forget(&mut header)?; - - if header[0] != 0x1f || header[1] != 0x8b { - return Err(bad_header()); - } - if header[2] != 8 { - return Err(bad_header()); - } - - r.part.flg = header[3]; - r.part.header.mtime = ((header[4] as u32) << 0) - | ((header[5] as u32) << 8) - | ((header[6] as u32) << 16) - | ((header[7] as u32) << 24); - let _xfl = header[8]; - r.part.header.operating_system = header[9]; - r.part.state = GzHeaderParsingState::Xlen; - } - GzHeaderParsingState::Xlen => { - if r.part.flg & FEXTRA != 0 { - r.part.xlen = read_le_u16(r)?; - } - r.part.state = GzHeaderParsingState::Extra; - } - GzHeaderParsingState::Extra => { - if r.part.flg & FEXTRA != 0 { - let mut extra = vec![0; r.part.xlen as usize]; - r.read_and_forget(&mut extra)?; - r.part.header.extra = Some(extra); - } - r.part.state = GzHeaderParsingState::Filename; - } - GzHeaderParsingState::Filename => { - if r.part.flg & FNAME != 0 { - if r.part.header.filename.is_none() { - r.part.header.filename = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; - } - } - } - r.part.state = GzHeaderParsingState::Comment; - } - GzHeaderParsingState::Comment => { - if r.part.flg & FCOMMENT != 0 { - if r.part.header.comment.is_none() { - r.part.header.comment = Some(Vec::new()); - }; - for byte in r.bytes() { - let byte = byte?; - if byte == 0 { - break; - } - } - } - r.part.state = GzHeaderParsingState::Crc; - } - GzHeaderParsingState::Crc => { - if r.part.flg & FHCRC != 0 { - let stored_crc = read_le_u16(r)?; - let calced_crc = r.part.crc.sum() as u16; - if stored_crc != calced_crc { - return Err(corrupt()); - } - } - return Ok(()); - } - } - } -} - -pub(crate) fn read_gz_header(r: &mut R) -> io::Result { - let mut part = GzHeaderPartial::new(); - - let result = { - let mut reader = Buffer::new(&mut part, r); - read_gz_header_part(&mut reader) - }; - result.map(|()| part.take_header()) -} - /// A gzip streaming encoder /// /// This structure exposes a [`BufRead`] interface that will read uncompressed data @@ -311,49 +208,6 @@ pub struct GzDecoder { multi: bool, } -#[derive(Debug)] -pub enum GzHeaderParsingState { - Start, - Xlen, - Extra, - Filename, - Comment, - Crc, -} - -#[derive(Debug)] -pub struct GzHeaderPartial { - buf: Vec, - state: GzHeaderParsingState, - flg: u8, - xlen: u16, - crc: Crc, - header: GzHeader, -} - -impl GzHeaderPartial { - fn new() -> GzHeaderPartial { - GzHeaderPartial { - buf: Vec::with_capacity(10), // minimum header length - state: GzHeaderParsingState::Start, - flg: 0, - xlen: 0, - crc: Crc::new(), - header: GzHeader { - extra: None, - filename: None, - comment: None, - operating_system: 0, - mtime: 0, - }, - } - } - - pub fn take_header(self) -> GzHeader { - self.header - } -} - #[derive(Debug)] enum GzState { Header(GzHeaderPartial), @@ -363,77 +217,6 @@ enum GzState { End, } -/// A small adapter which reads data originally from `buf` and then reads all -/// further data from `reader`. This will also buffer all data read from -/// `reader` into `buf` for reuse on a further call. -struct Buffer<'a, T: 'a> { - part: &'a mut GzHeaderPartial, - buf_cur: usize, - buf_max: usize, - reader: &'a mut T, -} - -impl<'a, T> Buffer<'a, T> { - fn new(part: &'a mut GzHeaderPartial, reader: &'a mut T) -> Buffer<'a, T> { - Buffer { - reader, - buf_cur: 0, - buf_max: part.buf.len(), - part, - } - } -} - -impl<'a, T: Read> Read for Buffer<'a, T> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let mut bufref = match self.part.state { - GzHeaderParsingState::Filename => self.part.header.filename.as_mut(), - GzHeaderParsingState::Comment => self.part.header.comment.as_mut(), - _ => None, - }; - if let Some(ref mut b) = bufref { - // we have a direct reference to a buffer where to write - let len = self.reader.read(buf)?; - if len > 0 && buf[len - 1] == 0 { - // we do not append the final 0 - b.extend_from_slice(&buf[..len - 1]); - } else { - b.extend_from_slice(&buf[..len]); - } - self.part.crc.update(&buf[..len]); - Ok(len) - } else if self.buf_cur == self.buf_max { - // we read new bytes and also save them in self.part.buf - let len = self.reader.read(buf)?; - self.part.buf.extend_from_slice(&buf[..len]); - self.part.crc.update(&buf[..len]); - Ok(len) - } else { - // we first read the previously saved bytes - let len = (&self.part.buf[self.buf_cur..self.buf_max]).read(buf)?; - self.buf_cur += len; - Ok(len) - } - } -} - -impl<'a, T> Buffer<'a, T> -where - T: std::io::Read, -{ - // If we manage to read all the bytes, we reset the buffer - fn read_and_forget(&mut self, buf: &mut [u8]) -> io::Result { - self.read_exact(buf)?; - // we managed to read the whole buf - // we will no longer need the previously saved bytes in self.part.buf - let rlen = buf.len(); - self.part.buf.truncate(0); - self.buf_cur = 0; - self.buf_max = 0; - Ok(rlen) - } -} - impl GzDecoder { /// Creates a new decoder from the given reader, immediately parsing the /// gzip header. diff --git a/src/gz/mod.rs b/src/gz/mod.rs index d31aa60b..e72c43e0 100644 --- a/src/gz/mod.rs +++ b/src/gz/mod.rs @@ -1,9 +1,9 @@ use std::ffi::CString; -use std::io::prelude::*; +use std::io::{self, prelude::*}; use std::time; use crate::bufreader::BufReader; -use crate::Compression; +use crate::{Compression, Crc}; pub static FHCRC: u8 = 1 << 1; pub static FEXTRA: u8 = 1 << 2; @@ -82,6 +82,155 @@ impl GzHeader { } } +#[derive(Debug)] +pub enum GzHeaderParsingState { + Start, + Xlen, + Extra, + Filename, + Comment, + Crc, +} + +#[derive(Debug)] +pub struct GzHeaderPartial { + buf: Vec, + state: GzHeaderParsingState, + flg: u8, + xlen: u16, + crc: Crc, + header: GzHeader, +} + +impl GzHeaderPartial { + fn new() -> GzHeaderPartial { + GzHeaderPartial { + buf: Vec::with_capacity(10), // minimum header length + state: GzHeaderParsingState::Start, + flg: 0, + xlen: 0, + crc: Crc::new(), + header: GzHeader { + extra: None, + filename: None, + comment: None, + operating_system: 0, + mtime: 0, + }, + } + } + + pub fn take_header(self) -> GzHeader { + self.header + } +} + +fn read_gz_header_part<'a, R: Read>(r: &'a mut Buffer<'a, R>) -> io::Result<()> { + loop { + match r.part.state { + GzHeaderParsingState::Start => { + let mut header = [0; 10]; + r.read_and_forget(&mut header)?; + + if header[0] != 0x1f || header[1] != 0x8b { + return Err(bad_header()); + } + if header[2] != 8 { + return Err(bad_header()); + } + + r.part.flg = header[3]; + r.part.header.mtime = ((header[4] as u32) << 0) + | ((header[5] as u32) << 8) + | ((header[6] as u32) << 16) + | ((header[7] as u32) << 24); + let _xfl = header[8]; + r.part.header.operating_system = header[9]; + r.part.state = GzHeaderParsingState::Xlen; + } + GzHeaderParsingState::Xlen => { + if r.part.flg & FEXTRA != 0 { + r.part.xlen = read_le_u16(r)?; + } + r.part.state = GzHeaderParsingState::Extra; + } + GzHeaderParsingState::Extra => { + if r.part.flg & FEXTRA != 0 { + let mut extra = vec![0; r.part.xlen as usize]; + r.read_and_forget(&mut extra)?; + r.part.header.extra = Some(extra); + } + r.part.state = GzHeaderParsingState::Filename; + } + GzHeaderParsingState::Filename => { + if r.part.flg & FNAME != 0 { + if r.part.header.filename.is_none() { + r.part.header.filename = Some(Vec::new()); + }; + for byte in r.bytes() { + let byte = byte?; + if byte == 0 { + break; + } + } + } + r.part.state = GzHeaderParsingState::Comment; + } + GzHeaderParsingState::Comment => { + if r.part.flg & FCOMMENT != 0 { + if r.part.header.comment.is_none() { + r.part.header.comment = Some(Vec::new()); + }; + for byte in r.bytes() { + let byte = byte?; + if byte == 0 { + break; + } + } + } + r.part.state = GzHeaderParsingState::Crc; + } + GzHeaderParsingState::Crc => { + if r.part.flg & FHCRC != 0 { + let stored_crc = read_le_u16(r)?; + let calced_crc = r.part.crc.sum() as u16; + if stored_crc != calced_crc { + return Err(corrupt()); + } + } + return Ok(()); + } + } + } +} + +fn read_gz_header(r: &mut R) -> io::Result { + let mut part = GzHeaderPartial::new(); + + let result = { + let mut reader = Buffer::new(&mut part, r); + read_gz_header_part(&mut reader) + }; + result.map(|()| part.take_header()) +} + +fn read_le_u16(r: &mut Buffer) -> io::Result { + let mut b = [0; 2]; + r.read_and_forget(&mut b)?; + Ok((b[0] as u16) | ((b[1] as u16) << 8)) +} + +fn bad_header() -> io::Error { + io::Error::new(io::ErrorKind::InvalidInput, "invalid gzip header") +} + +fn corrupt() -> io::Error { + io::Error::new( + io::ErrorKind::InvalidInput, + "corrupt gzip stream does not have a matching checksum", + ) +} + /// A builder structure to create a new gzip Encoder. /// /// This structure controls header configuration options such as the filename. @@ -249,6 +398,77 @@ impl GzBuilder { } } +/// A small adapter which reads data originally from `buf` and then reads all +/// further data from `reader`. This will also buffer all data read from +/// `reader` into `buf` for reuse on a further call. +struct Buffer<'a, T: 'a> { + part: &'a mut GzHeaderPartial, + buf_cur: usize, + buf_max: usize, + reader: &'a mut T, +} + +impl<'a, T> Buffer<'a, T> { + fn new(part: &'a mut GzHeaderPartial, reader: &'a mut T) -> Buffer<'a, T> { + Buffer { + reader, + buf_cur: 0, + buf_max: part.buf.len(), + part, + } + } +} + +impl<'a, T: Read> Read for Buffer<'a, T> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let mut bufref = match self.part.state { + GzHeaderParsingState::Filename => self.part.header.filename.as_mut(), + GzHeaderParsingState::Comment => self.part.header.comment.as_mut(), + _ => None, + }; + if let Some(ref mut b) = bufref { + // we have a direct reference to a buffer where to write + let len = self.reader.read(buf)?; + if len > 0 && buf[len - 1] == 0 { + // we do not append the final 0 + b.extend_from_slice(&buf[..len - 1]); + } else { + b.extend_from_slice(&buf[..len]); + } + self.part.crc.update(&buf[..len]); + Ok(len) + } else if self.buf_cur == self.buf_max { + // we read new bytes and also save them in self.part.buf + let len = self.reader.read(buf)?; + self.part.buf.extend_from_slice(&buf[..len]); + self.part.crc.update(&buf[..len]); + Ok(len) + } else { + // we first read the previously saved bytes + let len = (&self.part.buf[self.buf_cur..self.buf_max]).read(buf)?; + self.buf_cur += len; + Ok(len) + } + } +} + +impl<'a, T> Buffer<'a, T> +where + T: std::io::Read, +{ + // If we manage to read all the bytes, we reset the buffer + fn read_and_forget(&mut self, buf: &mut [u8]) -> io::Result { + self.read_exact(buf)?; + // we managed to read the whole buf + // we will no longer need the previously saved bytes in self.part.buf + let rlen = buf.len(); + self.part.buf.truncate(0); + self.buf_cur = 0; + self.buf_max = 0; + Ok(rlen) + } +} + #[cfg(test)] mod tests { use std::io::prelude::*; diff --git a/src/gz/write.rs b/src/gz/write.rs index 83eebb75..5336a17e 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -2,8 +2,7 @@ use std::cmp; use std::io; use std::io::prelude::*; -use super::bufread::{corrupt, read_gz_header}; -use super::{GzBuilder, GzHeader}; +use super::{corrupt, read_gz_header, GzBuilder, GzHeader}; use crate::crc::{Crc, CrcWriter}; use crate::zio; use crate::{Compress, Compression, Decompress, Status};