pass detection reason to adapter

This commit is contained in:
phiresky 2019-06-16 11:37:27 +02:00
parent d8b57f2f8a
commit 21f5178d15
10 changed files with 59 additions and 42 deletions

View File

@ -53,7 +53,10 @@ pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta; fn metadata(&self) -> &AdapterMeta;
} }
pub trait FileAdapter: GetMetadata { pub trait FileAdapter: GetMetadata {
fn adapt(&self, a: AdaptInfo) -> Fallible<()>; /// adapt a file.
///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
fn adapt(&self, a: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()>;
} }
pub struct AdaptInfo<'a> { pub struct AdaptInfo<'a> {
/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions. /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
@ -72,7 +75,10 @@ pub struct AdaptInfo<'a> {
pub config: PreprocConfig<'a>, pub config: PreprocConfig<'a>,
} }
pub fn get_all_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) { /// (enabledAdapters, disabledAdapters)
type AdaptersTuple = (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>);
pub fn get_all_adapters() -> AdaptersTuple {
// order in descending priority // order in descending priority
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![ let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(ffmpeg::FFmpegAdapter::new()), Rc::new(ffmpeg::FFmpegAdapter::new()),
@ -96,7 +102,7 @@ pub fn get_all_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>
* - "" means use default enabled adapter list * - "" means use default enabled adapter list
* - "a,b" means use adapters a,b * - "a,b" means use adapters a,b
* - "-a,b" means use default list except for a and b * - "-a,b" means use default list except for a and b
* - "+a,b" means use default list but also a and b * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
*/ */
pub fn get_adapters_filtered<T: AsRef<str>>( pub fn get_adapters_filtered<T: AsRef<str>>(
adapter_names: &[T], adapter_names: &[T],

View File

@ -79,7 +79,7 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
} }
impl FileAdapter for DecompressAdapter { impl FileAdapter for DecompressAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,

View File

@ -47,7 +47,7 @@ struct FFprobeStream {
codec_type: String, // video,audio,subtitle codec_type: String, // video,audio,subtitle
} }
impl FileAdapter for FFmpegAdapter { impl FileAdapter for FFmpegAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
is_real_file, is_real_file,
filepath_hint, filepath_hint,

View File

@ -42,7 +42,7 @@ impl GetMetadata for PdfPagesAdapter {
/// A pdf is basically converted to a zip that has Page X.png files. /// A pdf is basically converted to a zip that has Page X.png files.
/// This way, something like tesseract can process the pages individually /// This way, something like tesseract can process the pages individually
impl FileAdapter for PdfPagesAdapter { impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
is_real_file, is_real_file,

View File

@ -93,7 +93,7 @@ impl<T> FileAdapter for T
where where
T: SpawningFileAdapter, T: SpawningFileAdapter,
{ {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,

View File

@ -56,7 +56,7 @@ fn format_blob(b: ValueRef) -> String {
} }
impl FileAdapter for SqliteAdapter { impl FileAdapter for SqliteAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
is_real_file, is_real_file,
filepath_hint, filepath_hint,

View File

@ -35,7 +35,7 @@ impl GetMetadata for TarAdapter {
} }
impl FileAdapter for TarAdapter { impl FileAdapter for TarAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,

View File

@ -45,7 +45,7 @@ fn is_dir(f: &ZipFile) -> bool {
} }
impl FileAdapter for ZipAdapter { impl FileAdapter for ZipAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,

View File

@ -7,8 +7,6 @@ use failure::*;
use regex::{Regex, RegexSet}; use regex::{Regex, RegexSet};
use std::iter::Iterator; use std::iter::Iterator;
use std::rc::Rc; use std::rc::Rc;
@ -50,7 +48,7 @@ pub fn extension_to_regex(extension: &str) -> Regex {
pub fn adapter_matcher<T: AsRef<str>>( pub fn adapter_matcher<T: AsRef<str>>(
adapter_names: &[T], adapter_names: &[T],
slow: bool, slow: bool,
) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> { ) -> Fallible<impl Fn(FileMeta) -> Option<(Rc<dyn FileAdapter>, SlowMatcher)>> {
let adapters = get_adapters_filtered(adapter_names)?; let adapters = get_adapters_filtered(adapter_names)?;
// need order later // need order later
let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect(); let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect();
@ -61,9 +59,9 @@ pub fn adapter_matcher<T: AsRef<str>>(
use SlowMatcher::*; use SlowMatcher::*;
for matcher in metadata.get_matchers(slow) { for matcher in metadata.get_matchers(slow) {
match matcher.as_ref() { match matcher.as_ref() {
MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), f @ MimeType(re) => mime_regexes.push((re.clone(), adapter.clone(), f)),
Fast(FastMatcher::FileExtension(re)) => { f @ Fast(FastMatcher::FileExtension(re)) => {
fname_regexes.push((extension_to_regex(re), adapter.clone())) fname_regexes.push((extension_to_regex(re), adapter.clone(), f))
} }
}; };
} }
@ -85,15 +83,20 @@ pub fn adapter_matcher<T: AsRef<str>>(
}; };
if fname_matches.len() + mime_matches.len() > 1 { if fname_matches.len() + mime_matches.len() > 1 {
// get first according to original priority list... // get first according to original priority list...
let fa = fname_matches.iter().map(|e| fname_regexes[*e].1.clone()); // todo: kinda ugly
let fb = mime_matches.iter().map(|e| mime_regexes[*e].1.clone()); let fa = fname_matches
.iter()
.map(|e| (fname_regexes[*e].1.clone(), fname_regexes[*e].2.clone()));
let fb = mime_matches
.iter()
.map(|e| (mime_regexes[*e].1.clone(), mime_regexes[*e].2.clone()));
let mut v = vec![]; let mut v = vec![];
v.extend(fa); v.extend(fa);
v.extend(fb); v.extend(fb);
v.sort_by_key(|e| { v.sort_by_key(|e| {
(adapter_names (adapter_names
.iter() .iter()
.position(|r| r == &e.metadata().name) .position(|r| r == &e.0.metadata().name)
.expect("impossib7")) .expect("impossib7"))
}); });
eprintln!( eprintln!(
@ -101,7 +104,7 @@ pub fn adapter_matcher<T: AsRef<str>>(
meta.lossy_filename meta.lossy_filename
); );
for mmatch in v.iter() { for mmatch in v.iter() {
eprintln!(" - {}", mmatch.metadata().name); eprintln!(" - {}", mmatch.0.metadata().name);
} }
return Some(v[0].clone()); return Some(v[0].clone());
} }
@ -109,10 +112,12 @@ pub fn adapter_matcher<T: AsRef<str>>(
if fname_matches.is_empty() { if fname_matches.is_empty() {
None None
} else { } else {
Some(fname_regexes[fname_matches[0]].1.clone()) let (_, adapter, matcher) = fname_regexes[fname_matches[0]];
Some((adapter.clone(), matcher.clone()))
} }
} else { } else {
Some(mime_regexes[mime_matches[0]].1.clone()) let (_, adapter, matcher) = mime_regexes[mime_matches[0]];
Some((adapter.clone(), matcher.clone()))
} }
}) })
} }

View File

@ -63,8 +63,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
lossy_filename: filename.to_string_lossy().to_string(), lossy_filename: filename.to_string_lossy().to_string(),
}); });
match adapter { match adapter {
Some(ad) => { Some((adapter, detection_reason)) => {
let meta = ad.metadata(); let meta = adapter.metadata();
eprintln!("adapter: {}", &meta.name); eprintln!("adapter: {}", &meta.name);
let db_name = format!("{}.v{}", meta.name, meta.version); let db_name = format!("{}.v{}", meta.name, meta.version);
if let Some(cache) = cache.as_mut() { if let Some(cache) = cache.as_mut() {
@ -91,15 +91,18 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
args.cache_compression_level.try_into().unwrap(), args.cache_compression_level.try_into().unwrap(),
)?); )?);
eprintln!("adapting..."); eprintln!("adapting...");
ad.adapt(AdaptInfo { adapter.adapt(
line_prefix, AdaptInfo {
filepath_hint, line_prefix,
is_real_file, filepath_hint,
inp, is_real_file,
oup: &mut compbuf, inp,
archive_recursion_depth, oup: &mut compbuf,
config: PreprocConfig { cache: None, args }, archive_recursion_depth,
})?; config: PreprocConfig { cache: None, args },
},
detection_reason,
)?;
let compressed = compbuf let compressed = compbuf
.into_inner() .into_inner()
.map_err(|_| "could not finish zstd") .map_err(|_| "could not finish zstd")
@ -121,15 +124,18 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
Ok(()) Ok(())
} else { } else {
eprintln!("adapting..."); eprintln!("adapting...");
ad.adapt(AdaptInfo { adapter.adapt(
line_prefix, AdaptInfo {
filepath_hint, line_prefix,
is_real_file, filepath_hint,
inp, is_real_file,
oup, inp,
archive_recursion_depth, oup,
config: PreprocConfig { cache: None, args }, archive_recursion_depth,
})?; config: PreprocConfig { cache: None, args },
},
detection_reason,
)?;
Ok(()) Ok(())
} }
} }