From 0489a49d66bd89a98d15236b8b8a7b06a4f565ba Mon Sep 17 00:00:00 2001 From: phiresky Date: Tue, 11 Jun 2019 13:34:04 +0200 Subject: [PATCH] add slow matching (base) --- src/adapters/ffmpeg.rs | 5 +- src/adapters/mod.rs | 109 +++++++++++++++++++++++++++++++--------- src/adapters/pandoc.rs | 7 +-- src/adapters/poppler.rs | 5 +- src/adapters/sqlite.rs | 7 ++- src/adapters/tar.rs | 5 +- src/adapters/zip.rs | 5 +- src/args.rs | 31 +++++++----- src/bin/rga-preproc.rs | 2 +- src/bin/rga.rs | 12 ++--- src/caching_writer.rs | 2 +- src/preproc.rs | 10 ++-- 12 files changed, 137 insertions(+), 63 deletions(-) diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index d4c0877..efc5cbf 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -15,10 +15,11 @@ lazy_static! { name: "ffmpeg".to_owned(), version: 1, description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: None }; } diff --git a/src/adapters/mod.rs b/src/adapters/mod.rs index 33d42d0..e1b6062 100644 --- a/src/adapters/mod.rs +++ b/src/adapters/mod.rs @@ -9,19 +9,34 @@ use crate::preproc::PreprocConfig; use failure::*; use log::*; use regex::{Regex, RegexSet}; +use std::borrow::Borrow; +use std::borrow::Cow; use std::collections::HashMap; use std::io::prelude::*; +use std::iter::Iterator; use std::path::Path; use std::rc::Rc; -//pub use ffmpeg::FffmpegAdapter; -pub enum Matcher { +#[derive(Clone)] +pub enum FastMatcher { // MimeType(Regex), /** - * without the dot. e.g. "jpg" or "tar.gz" matched as /.*\.ext$/ + * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/ * */ FileExtension(String), + // todo: maybe add others, e.g. regex on whole filename or even paths + // todo: maybe allow matching a directory (e.g. /var/lib/postgres) +} + +#[derive(Clone)] +pub enum SlowMatcher { + /// any type of fast matcher + Fast(FastMatcher), + /// + /// match by exact mime type extracted using tree_magic + /// TODO: allow match ignoring suffix etc? + MimeType(String), } pub struct AdapterMeta { @@ -30,14 +45,32 @@ pub struct AdapterMeta { /// version identifier. 
used to key cache entries, change if your output format changes pub version: i32, pub description: String, - pub matchers: Vec, + /// list of matchers (interpreted as ORed) + pub fast_matchers: Vec, + /// list of matchers when we have mime type detection active (interpreted as ORed) + /// warning: this *overrides* the fast matchers + pub slow_matchers: Option>, +} +impl AdapterMeta { + // todo: this is pretty ugly + fn get_matchers<'a>(&'a self, slow: bool) -> Box> + 'a> { + match (slow, &self.slow_matchers) { + (true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))), + (_, _) => Box::new( + self.fast_matchers + .iter() + .map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))), + ), + } + } } pub struct FileMeta { // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed pub lossy_filename: String, - // pub mimetype: String, + // only given when slow matching is enabled + pub mimetype: Option, } pub trait GetMetadata { @@ -79,7 +112,9 @@ pub fn get_adapters() -> Vec> { adapters } -pub fn get_adapters_filtered(adapter_names: &Vec) -> Fallible>> { +pub fn get_adapters_filtered>( + adapter_names: &[T], +) -> Fallible>> { let all_adapters = get_adapters(); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = all_adapters @@ -89,8 +124,8 @@ pub fn get_adapters_filtered(adapter_names: &Vec) -> Fallible) -> Fallible) -> Fallible, + +pub fn adapter_matcher>( + adapter_names: &[T], + slow: bool, ) -> Fallible Option>> { let adapters = get_adapters_filtered(adapter_names)?; let mut fname_regexes = vec![]; - //let mut mime_regexes = vec![]; + let mut mime_regexes = vec![]; for adapter in adapters.into_iter() { let metadata = adapter.metadata(); - for matcher in &metadata.matchers { - match matcher { - //Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), - Matcher::FileExtension(re) => { + use SlowMatcher::*; + for matcher in metadata.get_matchers(slow) { + match matcher.as_ref() { + MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), + Fast(FastMatcher::FileExtension(re)) => { fname_regexes.push((extension_to_regex(re), adapter.clone())) } }; } } let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?; - //let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; + let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; Ok(move |meta: FileMeta| { - // todo: handle multiple conflicting matches - let matches = fname_regex_set.matches(&meta.lossy_filename); - match matches.iter().next() { - Some(m) => Some(fname_regexes[m].1.clone()), - None => None, + let fname_matches: Vec<_> = fname_regex_set + .matches(&meta.lossy_filename) + .into_iter() + .collect(); + let mime_matches: Vec<_> = if slow { + mime_regex_set + .matches(&meta.mimetype.expect("No mimetype?")) + .into_iter() + .collect() + } else { + vec![] + }; + if fname_matches.len() + mime_matches.len() > 1 { + eprintln!("Found multiple adapters for {}:", meta.lossy_filename); + for mmatch in mime_matches.iter() { + eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name); + } + for fmatch in fname_matches.iter() { + eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name); + } + } + if mime_matches.len() == 0 { + if fname_matches.len() == 0 { + None + } else { + 
Some(fname_regexes[fname_matches[0]].1.clone()) + } + } else { + Some(mime_regexes[mime_matches[0]].1.clone()) } - /*for m in mime_regex_set.matches(&meta.mimetype) { - return Some(mime_regexes[m].1.clone()); - }*/ }) } diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs index d37de4c..eb81b02 100644 --- a/src/adapters/pandoc.rs +++ b/src/adapters/pandoc.rs @@ -4,7 +4,7 @@ use spawning::SpawningFileAdapter; use std::process::Command; // from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs -// excluding formats that could cause problems (db = sqlite) or that are already text formats (e.g. xml-based) +// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based) //"db" -> Just "docbook" //"adoc" -> Just "asciidoc" //"asciidoc" -> Just "asciidoc" @@ -46,10 +46,11 @@ lazy_static! { name: "pandoc".to_owned(), version: 1, description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: None }; } #[derive(Default)] diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index c57ad65..e7151ae 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -12,10 +12,11 @@ lazy_static! { version: 1, description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" .to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: None }; } #[derive(Default)] diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 72c4a72..e9e0266 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -14,10 +14,13 @@ lazy_static! { description: "Uses sqlite bindings to convert sqlite databases into a simple plain text format" .to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: Some(vec![SlowMatcher::MimeType( + "application/x-sqlite3".to_owned() + )]) }; } diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index e74ec48..aae2190 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -13,10 +13,11 @@ lazy_static! { name: "tar".to_owned(), version: 1, description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: None }; } #[derive(Default)] diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index 9df5e19..8312421 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -14,10 +14,11 @@ lazy_static! 
{ name: "zip".to_owned(), version: 1, description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), - matchers: EXTENSIONS + fast_matchers: EXTENSIONS .iter() - .map(|s| Matcher::FileExtension(s.to_string())) + .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), + slow_matchers: None }; } #[derive(Default)] diff --git a/src/args.rs b/src/args.rs index e811839..730c482 100644 --- a/src/args.rs +++ b/src/args.rs @@ -32,58 +32,65 @@ set_default!(max_archive_recursion, 4, i32); #[structopt(rename_all = "kebab-case", set_term_width = 80)] pub struct RgaArgs { #[serde(default, skip_serializing_if = "is_default")] - #[structopt(long, help = "Disable caching of results")] - pub rga_no_cache: bool, + #[structopt(long = "--rga-no-cache", help = "Disable caching of results")] + pub no_cache: bool, #[serde(default, skip_serializing_if = "is_default")] #[structopt( - long, + long = "--rga-accurate", + help = "Use more accurate but slower matching by mime type" + )] + pub accurate: bool, + + #[serde(default, skip_serializing_if = "is_default")] + #[structopt( + long = "--rga-adapters", require_equals = true, require_delimiter = true, help = "Change which adapters to use and in which priority order (descending)" )] - pub rga_adapters: Vec, + pub adapters: Vec, #[serde( default = "def_cache_max_blob_len", skip_serializing_if = "def_cache_max_blob_len_if" )] #[structopt( - long, + long = "--rga-cache-max-blob-len", default_value = "2000000", help = "Max compressed size to cache", long_help = "Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time." )] - pub rga_cache_max_blob_len: u32, + pub cache_max_blob_len: u32, #[serde( default = "def_cache_compression_level", skip_serializing_if = "def_cache_compression_level_if" )] #[structopt( - long, + long = "--rga-cache-compression-level", default_value = "12", require_equals = true, help = "ZSTD compression level to apply to adapter outputs before storing in cache db" )] - pub rga_cache_compression_level: u32, + pub cache_compression_level: u32, #[serde( default = "def_max_archive_recursion", skip_serializing_if = "def_max_archive_recursion_if" )] #[structopt( - long, + long = "--rga-max-archive-recursion", default_value = "4", require_equals = true, help = "Maximum nestedness of archives to recurse into" )] - pub rga_max_archive_recursion: i32, + pub max_archive_recursion: i32, // these arguments stop the process, so don't serialize them #[serde(skip)] - #[structopt(long, help = "List all known adapters")] - pub rga_list_adapters: bool, + #[structopt(long = "--rga-list-adapters", help = "List all known adapters")] + pub list_adapters: bool, #[serde(skip)] #[structopt(long, help = "Show help for ripgrep itself")] diff --git a/src/bin/rga-preproc.rs b/src/bin/rga-preproc.rs index 90add0c..690ddfc 100644 --- a/src/bin/rga-preproc.rs +++ b/src/bin/rga-preproc.rs @@ -21,7 +21,7 @@ fn main() -> Fallible<()> { let i = File::open(&path)?; let mut o = std::io::stdout(); - let cache = if args.rga_no_cache { + let cache = if args.no_cache { None } else { Some(rga::preproc_cache::open()?) 
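The hunks above thread the new two-stage matching through the adapter metadata, the CLI flags, and rga-preproc. Below is a minimal usage sketch (not part of the patch) of how the matcher returned by adapter_matcher is expected to resolve an adapter; the crate paths, file name and adapter list are illustrative assumptions:

    use failure::Fallible;
    use rga::adapters::{adapter_matcher, FileMeta};

    fn resolve_example() -> Fallible<()> {
        // same call shape as the new preproc.rs: adapter names from --rga-adapters,
        // the slow flag driven by --rga-accurate
        let matcher = adapter_matcher(&["sqlite", "ffmpeg"], true)?;
        let adapter = matcher(FileMeta {
            // the extension alone would not select the sqlite adapter...
            lossy_filename: "backup.dat".to_owned(),
            // ...but the detected mime type hits its SlowMatcher::MimeType entry
            mimetype: Some("application/x-sqlite3".to_owned()),
        });
        assert!(adapter.is_some());
        Ok(())
    }

With accurate matching disabled (slow = false) the same FileMeta resolves to None, since only the extension regexes are consulted.
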
diff --git a/src/bin/rga.rs b/src/bin/rga.rs index 38e6d53..5dc1f73 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -62,17 +62,17 @@ fn main() -> Fallible<()> { env_logger::init(); let (args, passthrough_args) = split_args()?; - let adapters = get_adapters_filtered(&args.rga_adapters)?; + let adapters = get_adapters_filtered(&args.adapters)?; - if args.rga_list_adapters { + if args.list_adapters { println!("Adapters:\n"); for adapter in adapters { let meta = adapter.metadata(); let matchers = meta - .matchers + .fast_matchers .iter() .map(|m| match m { - Matcher::FileExtension(ext) => format!(".{}", ext), + FastMatcher::FileExtension(ext) => format!(".{}", ext), }) .collect::>() .join(", "); @@ -87,9 +87,9 @@ fn main() -> Fallible<()> { let extensions = adapters .iter() - .flat_map(|a| &a.metadata().matchers) + .flat_map(|a| &a.metadata().fast_matchers) .filter_map(|m| match m { - Matcher::FileExtension(ext) => Some(ext as &str), + FastMatcher::FileExtension(ext) => Some(ext as &str), }) .collect::>() .join(","); diff --git a/src/caching_writer.rs b/src/caching_writer.rs index 305a645..f681043 100644 --- a/src/caching_writer.rs +++ b/src/caching_writer.rs @@ -47,7 +47,7 @@ impl Write for CachingWriter { Some(writer) => { let wrote = writer.write(buf)?; let compressed_len = writer.get_ref().len(); - //eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len); + trace!("wrote {} to zstd, len now {}", wrote, compressed_len); if compressed_len > self.max_cache_size { eprintln!("cache longer than max, dropping"); //writer.finish(); diff --git a/src/preproc.rs b/src/preproc.rs index eda70fa..0af12e1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -20,7 +20,6 @@ pub struct PreprocConfig<'a> { * */ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { - let adapters = adapter_matcher(&ai.config.args.rga_adapters)?; let AdaptInfo { filepath_hint, is_real_file, @@ -32,11 +31,12 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { .. } = ai; let PreprocConfig { mut cache, args } = config; + let adapters = adapter_matcher(&args.adapters[..], args.accurate)?; let filename = filepath_hint .file_name() .ok_or_else(|| format_err!("Empty filename"))?; eprintln!("depth: {}", archive_recursion_depth); - if archive_recursion_depth >= args.rga_max_archive_recursion { + if archive_recursion_depth >= args.max_archive_recursion { writeln!(oup, "{}[rga: max archive recursion reached]", line_prefix)?; return Ok(()); } @@ -49,7 +49,7 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { )))?; println!("mimetype: {:?}", mimetype);*/ let adapter = adapters(FileMeta { - // mimetype, + mimetype: None, lossy_filename: filename.to_string_lossy().to_string(), }); match adapter { @@ -77,8 +77,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { // wrapping BufWriter here gives ~10% perf boost let mut compbuf = BufWriter::new(CachingWriter::new( oup, - args.rga_cache_max_blob_len.try_into().unwrap(), - args.rga_cache_compression_level.try_into().unwrap(), + args.cache_max_blob_len.try_into().unwrap(), + args.cache_compression_level.try_into().unwrap(), )?); eprintln!("adapting..."); ad.adapt(AdaptInfo {
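
In this base commit preproc.rs still passes mimetype: None, while the matcher closure calls expect("No mimetype?") on it when slow matching is active, so running with --rga-accurate still needs a detection step in front of the FileMeta construction. A minimal sketch of what that step could look like, assuming the tree_magic crate already named in the SlowMatcher doc comment and its from_u8 helper; the function name and buffer handling are illustrative, not part of the patch:

    use std::io::{BufRead, Result};

    // Only pay for content sniffing when --rga-accurate was given; otherwise the
    // extension-based fast matchers are enough.
    fn detect_mimetype(inp: &mut dyn BufRead, accurate: bool) -> Result<Option<String>> {
        if !accurate {
            return Ok(None);
        }
        // fill_buf exposes the buffered head of the stream without consuming it,
        // so the chosen adapter still sees the input from the start
        let head = inp.fill_buf()?;
        Ok(Some(tree_magic::from_u8(head)))
    }

The resulting Option<String> could then be passed straight into the mimetype field of FileMeta instead of the current None.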