diff --git a/src/adapters.rs b/src/adapters.rs index c2c4729..30af5ca 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -53,7 +53,10 @@ pub trait GetMetadata { fn metadata(&self) -> &AdapterMeta; } pub trait FileAdapter: GetMetadata { - fn adapt(&self, a: AdaptInfo) -> Fallible<()>; + /// adapt a file. + /// + /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher + fn adapt(&self, a: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()>; } pub struct AdaptInfo<'a> { /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions. @@ -72,7 +75,10 @@ pub struct AdaptInfo<'a> { pub config: PreprocConfig<'a>, } -pub fn get_all_adapters() -> (Vec>, Vec>) { +/// (enabledAdapters, disabledAdapters) +type AdaptersTuple = (Vec>, Vec>); + +pub fn get_all_adapters() -> AdaptersTuple { // order in descending priority let enabled_adapters: Vec> = vec![ Rc::new(ffmpeg::FFmpegAdapter::new()), @@ -96,7 +102,7 @@ pub fn get_all_adapters() -> (Vec>, Vec> * - "" means use default enabled adapter list * - "a,b" means use adapters a,b * - "-a,b" means use default list except for a and b - * - "+a,b" means use default list but also a and b + * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) */ pub fn get_adapters_filtered>( adapter_names: &[T], diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index 0902d66..141f0da 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -79,7 +79,7 @@ fn get_inner_filename(filename: &Path) -> PathBuf { } impl FileAdapter for DecompressAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 8ccc89d..3649e59 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -47,7 +47,7 @@ struct FFprobeStream { codec_type: String, // video,audio,subtitle } impl FileAdapter for FFmpegAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { is_real_file, filepath_hint, diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs index 13d7bbc..346ea6f 100644 --- a/src/adapters/pdfpages.rs +++ b/src/adapters/pdfpages.rs @@ -42,7 +42,7 @@ impl GetMetadata for PdfPagesAdapter { /// A pdf is basically converted to a zip that has Page X.png files. /// This way, something like tesseract can process the pages individually impl FileAdapter for PdfPagesAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, is_real_file, diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs index 726617a..8c446f3 100644 --- a/src/adapters/spawning.rs +++ b/src/adapters/spawning.rs @@ -93,7 +93,7 @@ impl FileAdapter for T where T: SpawningFileAdapter, { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 74cbe0c..ba9d66f 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -56,7 +56,7 @@ fn format_blob(b: ValueRef) -> String { } impl FileAdapter for SqliteAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { is_real_file, filepath_hint, diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index acd03d5..d490d00 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -35,7 +35,7 @@ impl GetMetadata for TarAdapter { } impl FileAdapter for TarAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index 8e6befd..e9bcba5 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -45,7 +45,7 @@ fn is_dir(f: &ZipFile) -> bool { } impl FileAdapter for ZipAdapter { - fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/matching.rs b/src/matching.rs index 2e10aa7..22b84c7 100644 --- a/src/matching.rs +++ b/src/matching.rs @@ -7,8 +7,6 @@ use failure::*; use regex::{Regex, RegexSet}; - - use std::iter::Iterator; use std::rc::Rc; @@ -50,7 +48,7 @@ pub fn extension_to_regex(extension: &str) -> Regex { pub fn adapter_matcher>( adapter_names: &[T], slow: bool, -) -> Fallible Option>> { +) -> Fallible Option<(Rc, SlowMatcher)>> { let adapters = get_adapters_filtered(adapter_names)?; // need order later let adapter_names: Vec = adapters.iter().map(|e| e.metadata().name.clone()).collect(); @@ -61,9 +59,9 @@ pub fn adapter_matcher>( use SlowMatcher::*; for matcher in metadata.get_matchers(slow) { match matcher.as_ref() { - MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), - Fast(FastMatcher::FileExtension(re)) => { - fname_regexes.push((extension_to_regex(re), adapter.clone())) + f @ MimeType(re) => mime_regexes.push((re.clone(), adapter.clone(), f)), + f @ Fast(FastMatcher::FileExtension(re)) => { + fname_regexes.push((extension_to_regex(re), adapter.clone(), f)) } }; } @@ -85,15 +83,20 @@ pub fn adapter_matcher>( }; if fname_matches.len() + mime_matches.len() > 1 { // get first according to original priority list... - let fa = fname_matches.iter().map(|e| fname_regexes[*e].1.clone()); - let fb = mime_matches.iter().map(|e| mime_regexes[*e].1.clone()); + // todo: kinda ugly + let fa = fname_matches + .iter() + .map(|e| (fname_regexes[*e].1.clone(), fname_regexes[*e].2.clone())); + let fb = mime_matches + .iter() + .map(|e| (mime_regexes[*e].1.clone(), mime_regexes[*e].2.clone())); let mut v = vec![]; v.extend(fa); v.extend(fb); v.sort_by_key(|e| { (adapter_names .iter() - .position(|r| r == &e.metadata().name) + .position(|r| r == &e.0.metadata().name) .expect("impossib7")) }); eprintln!( @@ -101,7 +104,7 @@ pub fn adapter_matcher>( meta.lossy_filename ); for mmatch in v.iter() { - eprintln!(" - {}", mmatch.metadata().name); + eprintln!(" - {}", mmatch.0.metadata().name); } return Some(v[0].clone()); } @@ -109,10 +112,12 @@ pub fn adapter_matcher>( if fname_matches.is_empty() { None } else { - Some(fname_regexes[fname_matches[0]].1.clone()) + let (_, adapter, matcher) = fname_regexes[fname_matches[0]]; + Some((adapter.clone(), matcher.clone())) } } else { - Some(mime_regexes[mime_matches[0]].1.clone()) + let (_, adapter, matcher) = mime_regexes[mime_matches[0]]; + Some((adapter.clone(), matcher.clone())) } }) } diff --git a/src/preproc.rs b/src/preproc.rs index 9cb7b2e..a29bd4c 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -63,8 +63,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { lossy_filename: filename.to_string_lossy().to_string(), }); match adapter { - Some(ad) => { - let meta = ad.metadata(); + Some((adapter, detection_reason)) => { + let meta = adapter.metadata(); eprintln!("adapter: {}", &meta.name); let db_name = format!("{}.v{}", meta.name, meta.version); if let Some(cache) = cache.as_mut() { @@ -91,15 +91,18 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { args.cache_compression_level.try_into().unwrap(), )?); eprintln!("adapting..."); - ad.adapt(AdaptInfo { - line_prefix, - filepath_hint, - is_real_file, - inp, - oup: &mut compbuf, - archive_recursion_depth, - config: PreprocConfig { cache: None, args }, - })?; + adapter.adapt( + AdaptInfo { + line_prefix, + filepath_hint, + is_real_file, + inp, + oup: &mut compbuf, + archive_recursion_depth, + config: PreprocConfig { cache: None, args }, + }, + detection_reason, + )?; let compressed = compbuf .into_inner() .map_err(|_| "could not finish zstd") @@ -121,15 +124,18 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { Ok(()) } else { eprintln!("adapting..."); - ad.adapt(AdaptInfo { - line_prefix, - filepath_hint, - is_real_file, - inp, - oup, - archive_recursion_depth, - config: PreprocConfig { cache: None, args }, - })?; + adapter.adapt( + AdaptInfo { + line_prefix, + filepath_hint, + is_real_file, + inp, + oup, + archive_recursion_depth, + config: PreprocConfig { cache: None, args }, + }, + detection_reason, + )?; Ok(()) } }