From c8f346c4ddae9a3b18639088bec277336b01e94a Mon Sep 17 00:00:00 2001 From: phiresky Date: Wed, 12 Jun 2019 12:25:02 +0200 Subject: [PATCH] restructure --- src/{adapters/mod.rs => adapters.rs} | 109 ++----------------------- src/adapters/tar.rs | 4 +- src/bin/rga.rs | 1 + src/lib.rs | 1 + src/matching.rs | 118 +++++++++++++++++++++++++++ src/preproc.rs | 1 + 6 files changed, 129 insertions(+), 105 deletions(-) rename src/{adapters/mod.rs => adapters.rs} (53%) create mode 100644 src/matching.rs diff --git a/src/adapters/mod.rs b/src/adapters.rs similarity index 53% rename from src/adapters/mod.rs rename to src/adapters.rs index 8cddc64..739d4f3 100644 --- a/src/adapters/mod.rs +++ b/src/adapters.rs @@ -5,11 +5,11 @@ pub mod spawning; pub mod sqlite; pub mod tar; pub mod zip; +use crate::matching::*; use crate::preproc::PreprocConfig; use failure::*; use log::*; -use regex::{Regex, RegexSet}; - +use regex::{Regex}; use std::borrow::Cow; use std::collections::HashMap; use std::io::prelude::*; @@ -17,28 +17,6 @@ use std::iter::Iterator; use std::path::Path; use std::rc::Rc; -#[derive(Clone)] -pub enum FastMatcher { - // MimeType(Regex), - /** - * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/ - * - */ - FileExtension(String), - // todo: maybe add others, e.g. regex on whole filename or even paths - // todo: maybe allow matching a directory (e.g. /var/lib/postgres) -} - -#[derive(Clone)] -pub enum SlowMatcher { - /// any type of fast matcher - Fast(FastMatcher), - /// - /// match by exact mime type extracted using tree_magic - /// TODO: allow match ignoring suffix etc? 
- MimeType(String), -} - pub struct AdapterMeta { /// unique short name of this adapter (a-z0-9 only) pub name: String, @@ -53,7 +31,10 @@ pub struct AdapterMeta { } impl AdapterMeta { // todo: this is pretty ugly - fn get_matchers<'a>(&'a self, slow: bool) -> Box> + 'a> { + pub fn get_matchers<'a>( + &'a self, + slow: bool, + ) -> Box> + 'a> { match (slow, &self.slow_matchers) { (true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))), (_, _) => Box::new( @@ -65,14 +46,6 @@ impl AdapterMeta { } } -pub struct FileMeta { - // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, - // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed - pub lossy_filename: String, - // only given when slow matching is enabled - pub mimetype: Option, -} - pub trait GetMetadata { fn metadata(&self) -> &AdapterMeta; } @@ -160,73 +133,3 @@ pub fn get_adapters_filtered>( ); Ok(adapters) } - -pub fn adapter_matcher>( - adapter_names: &[T], - slow: bool, -) -> Fallible Option>> { - let adapters = get_adapters_filtered(adapter_names)?; - // need order later - let adapter_names: Vec = adapters.iter().map(|e| e.metadata().name.clone()).collect(); - let mut fname_regexes = vec![]; - let mut mime_regexes = vec![]; - for adapter in adapters.into_iter() { - let metadata = adapter.metadata(); - use SlowMatcher::*; - for matcher in metadata.get_matchers(slow) { - match matcher.as_ref() { - MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), - Fast(FastMatcher::FileExtension(re)) => { - fname_regexes.push((extension_to_regex(re), adapter.clone())) - } - }; - } - } - let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?; - let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; - Ok(move |meta: FileMeta| { - let fname_matches: Vec<_> = fname_regex_set - 
.matches(&meta.lossy_filename) - .into_iter() - .collect(); - let mime_matches: Vec<_> = if slow { - mime_regex_set - .matches(&meta.mimetype.expect("No mimetype?")) - .into_iter() - .collect() - } else { - vec![] - }; - if fname_matches.len() + mime_matches.len() > 1 { - // get first according to original priority list... - let fa = fname_matches.iter().map(|e| fname_regexes[*e].1.clone()); - let fb = mime_matches.iter().map(|e| mime_regexes[*e].1.clone()); - let mut v = vec![]; - v.extend(fa); - v.extend(fb); - v.sort_by_key(|e| { - (adapter_names - .iter() - .position(|r| r == &e.metadata().name) - .expect("impossib7")) - }); - eprintln!( - "Warning: found multiple adapters for {}:", - meta.lossy_filename - ); - for mmatch in v.iter() { - eprintln!(" - {}", mmatch.metadata().name); - } - return Some(v[0].clone()); - } - if mime_matches.is_empty() { - if fname_matches.is_empty() { - None - } else { - Some(fname_regexes[fname_matches[0]].1.clone()) - } - } else { - Some(mime_regexes[mime_matches[0]].1.clone()) - } - }) -} diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 942e5f4..367f7fc 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -41,8 +41,8 @@ where let extension = filename.extension().map(|e| e.to_string_lossy().to_owned()); match extension { Some(e) => Ok(match e.to_owned().as_ref() { - "gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), - "bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), + "tgz" | "gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), + "tbz" | "tbz2" | "bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), "xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)), "zst" => Box::new(zstd::stream::read::Decoder::new(inp)?), "tar" => Box::new(inp), diff --git a/src/bin/rga.rs b/src/bin/rga.rs index 234179a..aa4dafb 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -2,6 +2,7 @@ use failure::Fallible; use rga::adapters::spawning::map_exe_error; use rga::adapters::*; use rga::args::*; +use 
rga::matching::*; use ripgrep_all as rga; use std::process::Command; diff --git a/src/lib.rs b/src/lib.rs index f9bddac..05bcd0d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ pub mod adapters; pub mod args; mod caching_writer; +pub mod matching; pub mod preproc; pub mod preproc_cache; pub use caching_writer::CachingWriter; diff --git a/src/matching.rs b/src/matching.rs new file mode 100644 index 0000000..2e10aa7 --- /dev/null +++ b/src/matching.rs @@ -0,0 +1,118 @@ +/** + * Module for matching adapters to files based on file name or mime type + */ +use crate::adapters::*; + +use failure::*; + +use regex::{Regex, RegexSet}; + + + +use std::iter::Iterator; + +use std::rc::Rc; + +#[derive(Clone)] +pub enum FastMatcher { + // MimeType(Regex), + /** + * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/ + * + */ + FileExtension(String), + // todo: maybe add others, e.g. regex on whole filename or even paths + // todo: maybe allow matching a directory (e.g. /var/lib/postgres) +} + +#[derive(Clone)] +pub enum SlowMatcher { + /// any type of fast matcher + Fast(FastMatcher), + /// + /// match by exact mime type extracted using tree_magic + /// TODO: allow match ignoring suffix etc? 
+ MimeType(String), +} + +pub struct FileMeta { + // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, + // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed + pub lossy_filename: String, + // only given when slow matching is enabled + pub mimetype: Option, +} + +pub fn extension_to_regex(extension: &str) -> Regex { + Regex::new(&format!(".*\\.{}", &regex::escape(extension))).expect("we know this regex compiles") +} + +pub fn adapter_matcher>( + adapter_names: &[T], + slow: bool, +) -> Fallible Option>> { + let adapters = get_adapters_filtered(adapter_names)?; + // need order later + let adapter_names: Vec = adapters.iter().map(|e| e.metadata().name.clone()).collect(); + let mut fname_regexes = vec![]; + let mut mime_regexes = vec![]; + for adapter in adapters.into_iter() { + let metadata = adapter.metadata(); + use SlowMatcher::*; + for matcher in metadata.get_matchers(slow) { + match matcher.as_ref() { + MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), + Fast(FastMatcher::FileExtension(re)) => { + fname_regexes.push((extension_to_regex(re), adapter.clone())) + } + }; + } + } + let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?; + let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; + Ok(move |meta: FileMeta| { + let fname_matches: Vec<_> = fname_regex_set + .matches(&meta.lossy_filename) + .into_iter() + .collect(); + let mime_matches: Vec<_> = if slow { + mime_regex_set + .matches(&meta.mimetype.expect("No mimetype?")) + .into_iter() + .collect() + } else { + vec![] + }; + if fname_matches.len() + mime_matches.len() > 1 { + // get first according to original priority list... 
+ let fa = fname_matches.iter().map(|e| fname_regexes[*e].1.clone()); + let fb = mime_matches.iter().map(|e| mime_regexes[*e].1.clone()); + let mut v = vec![]; + v.extend(fa); + v.extend(fb); + v.sort_by_key(|e| { + (adapter_names + .iter() + .position(|r| r == &e.metadata().name) + .expect("impossib7")) + }); + eprintln!( + "Warning: found multiple adapters for {}:", + meta.lossy_filename + ); + for mmatch in v.iter() { + eprintln!(" - {}", mmatch.metadata().name); + } + return Some(v[0].clone()); + } + if mime_matches.is_empty() { + if fname_matches.is_empty() { + None + } else { + Some(fname_regexes[fname_matches[0]].1.clone()) + } + } else { + Some(mime_regexes[mime_matches[0]].1.clone()) + } + }) +} diff --git a/src/preproc.rs b/src/preproc.rs index 5fb5880..0d2e5fe 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -1,5 +1,6 @@ use crate::adapters::*; use crate::args::RgaArgs; +use crate::matching::*; use crate::CachingWriter; use failure::Fallible; use failure::{format_err, Error};