diff --git a/CHANGELOG.md b/CHANGELOG.md index fc2e95e..a64e41a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# 0.9.0 (2019-06-16) + +- Split decompress and tar adapter so we can also read pure .bz2 files etc +- Add mime type detection to decompress so we can read e.g. /boot/initramfs.img which is a bz2 file without ending + # 0.8.9 (2019-06-15) - Finally fix linux binary package diff --git a/exampledir/compress/test.log b/exampledir/compress/test.log new file mode 100644 index 0000000..560c37f --- /dev/null +++ b/exampledir/compress/test.log @@ -0,0 +1,2 @@ +hello world +this is a test diff --git a/exampledir/compress/test.log.bz2 b/exampledir/compress/test.log.bz2 new file mode 100644 index 0000000..1c3a872 Binary files /dev/null and b/exampledir/compress/test.log.bz2 differ diff --git a/exampledir/compress/test.log.gz b/exampledir/compress/test.log.gz new file mode 100644 index 0000000..ed0a24c Binary files /dev/null and b/exampledir/compress/test.log.gz differ diff --git a/exampledir/compress/test.log.xz b/exampledir/compress/test.log.xz new file mode 100644 index 0000000..b2f182d Binary files /dev/null and b/exampledir/compress/test.log.xz differ diff --git a/exampledir/compress/test.log.zst b/exampledir/compress/test.log.zst new file mode 100644 index 0000000..4b93198 Binary files /dev/null and b/exampledir/compress/test.log.zst differ diff --git a/src/adapters.rs b/src/adapters.rs index cdcc6ce..c2c4729 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -1,3 +1,4 @@ +pub mod decompress; pub mod ffmpeg; pub mod pandoc; pub mod pdfpages; @@ -11,7 +12,7 @@ use crate::matching::*; use crate::preproc::PreprocConfig; use failure::*; use log::*; -use regex::{Regex}; +use regex::Regex; use std::borrow::Cow; use std::collections::HashMap; use std::io::prelude::*; @@ -78,6 +79,7 @@ pub fn get_all_adapters() -> (Vec>, Vec> Rc::new(pandoc::PandocAdapter::new()), Rc::new(poppler::PopplerAdapter::new()), Rc::new(zip::ZipAdapter::new()), + Rc::new(decompress::DecompressAdapter::new()), Rc::new(tar::TarAdapter::new()), Rc::new(sqlite::SqliteAdapter::new()), ]; diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs new file mode 100644 index 0000000..0902d66 --- /dev/null +++ b/src/adapters/decompress.rs @@ -0,0 +1,125 @@ +use super::*; +use crate::preproc::rga_preproc; +use failure::*; +use lazy_static::lazy_static; + +use std::path::PathBuf; + +static EXTENSIONS: &[&str] = &["tgz", "tbz", "tbz2", "gz", "bz2", "xz", "zst"]; +static MIME_TYPES: &[&str] = &[ + "application/gzip", + "application/x-bzip", + "application/x-xz", + "application/zstd", +]; +lazy_static! { + static ref METADATA: AdapterMeta = AdapterMeta { + name: "decompress".to_owned(), + version: 1, + description: + "Reads compressed file as a stream and runs a different extractor on the contents." + .to_owned(), + fast_matchers: EXTENSIONS + .iter() + .map(|s| FastMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: Some( + MIME_TYPES + .iter() + .map(|s| SlowMatcher::MimeType(s.to_string())) + .collect() + ), + }; +} +#[derive(Default)] +pub struct DecompressAdapter; + +impl DecompressAdapter { + pub fn new() -> DecompressAdapter { + DecompressAdapter + } +} +impl GetMetadata for DecompressAdapter { + fn metadata(&self) -> &AdapterMeta { + &METADATA + } +} + +fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible> +where + R: Read, +{ + let extension = filename.extension().map(|e| e.to_string_lossy().to_owned()); + + match extension { + Some(e) => Ok(match e.to_owned().as_ref() { + "tgz" | "gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), + "tbz" | "tbz2" | "bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), + "xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)), + "zst" => Box::new(zstd::stream::read::Decoder::new(inp)?), + ext => Err(format_err!("don't know how to decompress {}", ext))?, + }), + None => Err(format_err!("no extension")), + } +} +fn get_inner_filename(filename: &Path) -> PathBuf { + let extension = filename + .extension() + .map(|e| e.to_string_lossy().to_owned()) + .unwrap_or(Cow::Borrowed("")); + let stem = filename + .file_stem() + .expect("no filename given?") + .to_string_lossy(); + let new_extension = match extension.to_owned().as_ref() { + "tgz" | "tbz" | "tbz2" => ".tar", + _other => "", + }; + filename.with_file_name(format!("{}{}", stem, new_extension)) +} + +impl FileAdapter for DecompressAdapter { + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + let AdaptInfo { + filepath_hint, + mut inp, + oup, + line_prefix, + archive_recursion_depth, + config, + .. + } = ai; + + let mut decompress = decompress_any(filepath_hint, &mut inp)?; + let ai2: AdaptInfo = AdaptInfo { + filepath_hint: &get_inner_filename(filepath_hint), + is_real_file: false, + archive_recursion_depth: archive_recursion_depth + 1, + inp: &mut decompress, + oup, + line_prefix, + config: config.clone(), + }; + rga_preproc(ai2)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_inner_filename() { + for (a, b) in &[ + ("hi/test.tgz", "hi/test.tar"), + ("hi/hello.gz", "hi/hello"), + ("a/b/initramfs", "a/b/initramfs"), + ("hi/test.tbz2", "hi/test.tar"), + ("hi/test.tbz", "hi/test.tar"), + ("hi/test.hi.bz2", "hi/test.hi"), + ("hello.tar.gz", "hello.tar"), + ] { + assert_eq!(get_inner_filename(&PathBuf::from(a)).to_string_lossy(), *b); + } + } +} diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 367f7fc..acd03d5 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -6,7 +6,7 @@ use lazy_static::lazy_static; use std::path::PathBuf; -static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"]; +static EXTENSIONS: &[&str] = &["tar"]; lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { @@ -34,24 +34,6 @@ impl GetMetadata for TarAdapter { } } -fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible> -where - R: Read, -{ - let extension = filename.extension().map(|e| e.to_string_lossy().to_owned()); - match extension { - Some(e) => Ok(match e.to_owned().as_ref() { - "tgz" | "gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), - "tbz" | "tbz2" | "bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), - "xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)), - "zst" => Box::new(zstd::stream::read::Decoder::new(inp)?), - "tar" => Box::new(inp), - ext => Err(format_err!("don't know how to decompress {}", ext))?, - }), - None => Err(format_err!("no extension")), - } -} - impl FileAdapter for TarAdapter { fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { let AdaptInfo { @@ -63,9 +45,7 @@ impl FileAdapter for TarAdapter { config, .. } = ai; - - let decompress = decompress_any(filepath_hint, &mut inp)?; - let mut archive = ::tar::Archive::new(decompress); + let mut archive = ::tar::Archive::new(&mut inp); for entry in archive.entries()? { let mut file = entry.unwrap(); if Regular == file.header().entry_type() { diff --git a/src/preproc.rs b/src/preproc.rs index 0d2e5fe..9cb7b2e 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -134,14 +134,17 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { } } None => { - // allow passthrough if the file is in an archive, + // allow passthrough if the file is in an archive or accurate matching is enabled // otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us - let allow_cat = !is_real_file; + let allow_cat = !is_real_file || args.accurate; if allow_cat { spawning::postproc_line_prefix(line_prefix, inp, oup)?; Ok(()) } else { - Err(format_err!("No adapter found for file {:?}", filename)) + Err(format_err!( + "No adapter found for file {:?}, passthrough disabled.", + filename + )) } } }