2020-09-30 14:49:51 +00:00
|
|
|
pub mod custom;
|
2020-09-28 20:55:55 +00:00
|
|
|
// pub mod decompress;
|
|
|
|
// pub mod ffmpeg;
|
2022-10-29 18:54:05 +00:00
|
|
|
// pub mod postproc;
|
2020-09-10 15:18:11 +00:00
|
|
|
// pub mod pdfpages;
|
2020-09-30 14:49:51 +00:00
|
|
|
pub mod spawning;
|
2022-11-04 23:47:43 +00:00
|
|
|
use std::sync::Arc;
|
2020-09-28 20:55:55 +00:00
|
|
|
// pub mod sqlite;
|
2020-09-10 15:18:11 +00:00
|
|
|
// pub mod tar;
|
|
|
|
// pub mod tesseract;
|
2020-09-28 20:55:55 +00:00
|
|
|
// pub mod writing;
|
2022-10-29 18:54:05 +00:00
|
|
|
// pub mod zip;
|
2020-09-30 15:26:42 +00:00
|
|
|
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
|
2020-06-06 10:57:43 +00:00
|
|
|
use anyhow::*;
|
2021-08-26 14:00:27 +00:00
|
|
|
use custom::builtin_spawning_adapters;
|
2020-09-30 14:49:51 +00:00
|
|
|
use custom::CustomAdapterConfig;
|
2019-06-07 22:04:48 +00:00
|
|
|
use log::*;
|
2022-10-29 18:54:05 +00:00
|
|
|
use tokio::io::AsyncRead;
|
2020-06-11 21:09:31 +00:00
|
|
|
|
2019-06-11 11:34:04 +00:00
|
|
|
use std::borrow::Cow;
|
2019-06-07 22:04:48 +00:00
|
|
|
use std::collections::HashMap;
|
2019-06-11 11:34:04 +00:00
|
|
|
use std::iter::Iterator;
|
2020-09-30 14:22:54 +00:00
|
|
|
use std::path::PathBuf;
|
2022-10-29 18:54:05 +00:00
|
|
|
use std::pin::Pin;
|
2019-06-04 18:08:26 +00:00
|
|
|
use std::rc::Rc;
|
2022-11-04 23:47:43 +00:00
|
|
|
use core::fmt::Debug;
|
2019-06-04 18:08:26 +00:00
|
|
|
|
2022-10-29 21:56:25 +00:00
|
|
|
/// Pinned, heap-allocated, `Send` trait object implementing tokio's `AsyncRead`.
pub type ReadBox = Pin<Box<dyn AsyncRead + Send>>;
|
2019-06-04 18:08:26 +00:00
|
|
|
pub struct AdapterMeta {
|
2019-06-07 22:04:48 +00:00
|
|
|
/// unique short name of this adapter (a-z0-9 only)
|
2019-06-04 18:08:26 +00:00
|
|
|
pub name: String,
|
2019-06-07 22:04:48 +00:00
|
|
|
/// version identifier. used to key cache entries, change if your output format changes
|
2019-06-04 18:08:26 +00:00
|
|
|
pub version: i32,
|
2019-06-07 22:04:48 +00:00
|
|
|
pub description: String,
|
2019-06-16 10:19:01 +00:00
|
|
|
/// indicates whether this adapter can descend (=call rga_preproc again). if true, the cache key needs to include the list of active adapters
|
|
|
|
pub recurses: bool,
|
|
|
|
/// list of matchers (interpreted as a OR b OR ...)
|
2020-06-17 09:45:06 +00:00
|
|
|
pub fast_matchers: Vec<FastFileMatcher>,
|
2019-06-11 11:34:04 +00:00
|
|
|
/// list of matchers when we have mime type detection active (interpreted as ORed)
|
|
|
|
/// warning: this *overrides* the fast matchers
|
2020-06-17 09:45:06 +00:00
|
|
|
pub slow_matchers: Option<Vec<FileMatcher>>,
|
2020-09-10 15:18:11 +00:00
|
|
|
/// if true, slow_matchers is merged with fast matchers if accurate is enabled
|
|
|
|
/// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite.
|
|
|
|
/// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives
|
|
|
|
pub keep_fast_matchers_if_accurate: bool,
|
2020-06-09 16:27:22 +00:00
|
|
|
// if true, adapter is only used when user lists it in `--rga-adapters`
|
|
|
|
pub disabled_by_default: bool,
|
2019-06-11 11:34:04 +00:00
|
|
|
}
|
|
|
|
impl AdapterMeta {
|
|
|
|
// todo: this is pretty ugly
|
2019-06-12 10:25:02 +00:00
|
|
|
pub fn get_matchers<'a>(
|
|
|
|
&'a self,
|
|
|
|
slow: bool,
|
2020-06-17 09:45:06 +00:00
|
|
|
) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + 'a> {
|
2020-09-10 15:18:11 +00:00
|
|
|
match (
|
|
|
|
slow,
|
|
|
|
self.keep_fast_matchers_if_accurate,
|
|
|
|
&self.slow_matchers,
|
|
|
|
) {
|
|
|
|
(true, false, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
|
|
|
|
(true, true, Some(ref sm)) => Box::new(
|
|
|
|
sm.iter().map(|e| Cow::Borrowed(e)).chain(
|
|
|
|
self.fast_matchers
|
|
|
|
.iter()
|
|
|
|
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
|
|
|
|
),
|
|
|
|
),
|
|
|
|
// don't have slow matchers or slow matching disabled
|
|
|
|
(true, _, None) | (false, _, _) => Box::new(
|
2019-06-11 11:34:04 +00:00
|
|
|
self.fast_matchers
|
|
|
|
.iter()
|
2020-06-17 09:45:06 +00:00
|
|
|
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
|
2019-06-11 11:34:04 +00:00
|
|
|
),
|
|
|
|
}
|
|
|
|
}
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
pub trait GetMetadata {
|
2019-06-06 21:43:30 +00:00
|
|
|
fn metadata(&self) -> &AdapterMeta;
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
2022-11-04 23:47:43 +00:00
|
|
|
pub trait FileAdapter: GetMetadata + Send + Sync{
|
2019-06-16 09:37:27 +00:00
|
|
|
/// adapt a file.
|
|
|
|
///
|
|
|
|
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
|
2020-09-28 20:55:55 +00:00
|
|
|
fn adapt<'a>(
|
|
|
|
&self,
|
2022-10-29 21:56:25 +00:00
|
|
|
a: AdaptInfo,
|
2020-09-28 20:55:55 +00:00
|
|
|
detection_reason: &FileMatcher,
|
2022-10-29 21:56:25 +00:00
|
|
|
) -> Result<AdaptedFilesIterBox>;
|
2020-09-30 14:22:54 +00:00
|
|
|
}
|
|
|
|
|
2022-10-29 21:56:25 +00:00
|
|
|
pub struct AdaptInfo {
|
2019-06-06 21:43:30 +00:00
|
|
|
/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
|
2020-06-11 21:09:31 +00:00
|
|
|
pub filepath_hint: PathBuf,
|
2019-06-06 21:43:30 +00:00
|
|
|
/// true if filepath_hint is an actual file on the file system
|
|
|
|
pub is_real_file: bool,
|
2019-06-07 13:43:19 +00:00
|
|
|
/// depth at which this file is in archives. 0 for real filesystem
|
|
|
|
pub archive_recursion_depth: i32,
|
2019-06-06 21:50:58 +00:00
|
|
|
/// stream to read the file from. can be from a file or from some decoder
|
2022-10-29 21:56:25 +00:00
|
|
|
pub inp: ReadBox,
|
2019-06-06 21:50:58 +00:00
|
|
|
/// prefix every output line with this string to better indicate the file's location if it is in some archive
|
2020-06-11 21:09:31 +00:00
|
|
|
pub line_prefix: String,
|
2020-09-30 14:22:54 +00:00
|
|
|
pub postprocess: bool,
|
2022-11-04 23:47:43 +00:00
|
|
|
pub config: RgaConfig,
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
2019-06-16 09:37:27 +00:00
|
|
|
/// Pair of adapter lists: (adapters enabled by default, adapters disabled by default).
|
2022-11-04 23:47:43 +00:00
|
|
|
type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>);
|
2019-06-16 09:37:27 +00:00
|
|
|
|
2020-09-30 14:49:51 +00:00
|
|
|
pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
|
2019-06-11 12:40:58 +00:00
|
|
|
// order in descending priority
|
2022-11-04 23:47:43 +00:00
|
|
|
let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![];
|
2020-09-30 14:49:51 +00:00
|
|
|
if let Some(custom_adapters) = custom_adapters {
|
2020-06-08 23:45:52 +00:00
|
|
|
for adapter_config in custom_adapters {
|
2022-11-04 23:47:43 +00:00
|
|
|
adapters.push(Arc::new(adapter_config.to_adapter()));
|
2020-06-08 23:45:52 +00:00
|
|
|
}
|
2020-09-30 14:49:51 +00:00
|
|
|
}
|
2020-06-08 23:45:52 +00:00
|
|
|
|
2022-11-04 23:47:43 +00:00
|
|
|
let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![
|
2020-09-28 20:55:55 +00:00
|
|
|
//Rc::new(ffmpeg::FFmpegAdapter::new()),
|
2022-10-29 18:54:05 +00:00
|
|
|
// Rc::new(zip::ZipAdapter::new()),
|
2020-09-28 20:55:55 +00:00
|
|
|
//Rc::new(decompress::DecompressAdapter::new()),
|
2020-06-11 21:09:31 +00:00
|
|
|
// Rc::new(tar::TarAdapter::new()),
|
2020-09-28 20:55:55 +00:00
|
|
|
//Rc::new(sqlite::SqliteAdapter::new()),
|
2020-06-11 21:09:31 +00:00
|
|
|
// Rc::new(pdfpages::PdfPagesAdapter::new()),
|
2020-09-10 15:18:11 +00:00
|
|
|
// Rc::new(tesseract::TesseractAdapter::new()),
|
2019-06-12 10:39:27 +00:00
|
|
|
];
|
2021-08-26 14:00:27 +00:00
|
|
|
adapters.extend(
|
2020-06-09 16:27:22 +00:00
|
|
|
builtin_spawning_adapters
|
|
|
|
.iter()
|
2022-11-04 23:47:43 +00:00
|
|
|
.map(|e| -> Arc<dyn FileAdapter> { Arc::new(e.to_adapter()) }),
|
2021-08-26 14:00:27 +00:00
|
|
|
);
|
2020-06-09 16:27:22 +00:00
|
|
|
adapters.extend(internal_adapters);
|
|
|
|
|
|
|
|
adapters
|
|
|
|
.into_iter()
|
|
|
|
.partition(|e| !e.metadata().disabled_by_default)
|
2019-06-05 19:28:35 +00:00
|
|
|
}
|
2019-06-04 18:08:26 +00:00
|
|
|
|
2019-06-12 10:39:27 +00:00
|
|
|
/**
|
|
|
|
* filter adapters by given names:
|
|
|
|
*
|
|
|
|
* - "" means use default enabled adapter list
|
2019-06-12 15:23:30 +00:00
|
|
|
* - "a,b" means use adapters a,b
|
2019-06-12 10:39:27 +00:00
|
|
|
* - "-a,b" means use default list except for a and b
|
2019-06-16 09:37:27 +00:00
|
|
|
* - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
|
2019-06-12 10:39:27 +00:00
|
|
|
*/
|
2019-06-11 11:34:04 +00:00
|
|
|
pub fn get_adapters_filtered<T: AsRef<str>>(
|
2020-09-30 14:49:51 +00:00
|
|
|
custom_adapters: Option<Vec<CustomAdapterConfig>>,
|
2020-06-08 23:45:52 +00:00
|
|
|
adapter_names: &Vec<T>,
|
2022-11-04 23:47:43 +00:00
|
|
|
) -> Result<Vec<Arc<dyn FileAdapter>>> {
|
2020-09-30 14:49:51 +00:00
|
|
|
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters);
|
2019-06-07 22:04:48 +00:00
|
|
|
let adapters = if !adapter_names.is_empty() {
|
2019-06-12 10:39:27 +00:00
|
|
|
let adapters_map: HashMap<_, _> = def_enabled_adapters
|
2019-06-07 22:04:48 +00:00
|
|
|
.iter()
|
2019-06-12 10:39:27 +00:00
|
|
|
.chain(def_disabled_adapters.iter())
|
2019-06-07 22:04:48 +00:00
|
|
|
.map(|e| (e.metadata().name.clone(), e.clone()))
|
|
|
|
.collect();
|
|
|
|
let mut adapters = vec![];
|
|
|
|
let mut subtractive = false;
|
2019-06-12 15:23:30 +00:00
|
|
|
let mut additive = false;
|
2019-06-07 22:04:48 +00:00
|
|
|
for (i, name) in adapter_names.iter().enumerate() {
|
2019-06-11 11:34:04 +00:00
|
|
|
let mut name = name.as_ref();
|
|
|
|
if i == 0 && (name.starts_with('-')) {
|
2019-06-07 22:04:48 +00:00
|
|
|
subtractive = true;
|
|
|
|
name = &name[1..];
|
2019-06-12 10:39:27 +00:00
|
|
|
adapters = def_enabled_adapters.clone();
|
|
|
|
} else if i == 0 && (name.starts_with('+')) {
|
|
|
|
name = &name[1..];
|
|
|
|
adapters = def_enabled_adapters.clone();
|
2019-06-12 15:23:30 +00:00
|
|
|
additive = true;
|
2019-06-07 22:04:48 +00:00
|
|
|
}
|
|
|
|
if subtractive {
|
|
|
|
let inx = adapters
|
|
|
|
.iter()
|
2019-06-11 11:34:04 +00:00
|
|
|
.position(|a| a.metadata().name == name)
|
2019-06-07 22:04:48 +00:00
|
|
|
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
|
|
|
adapters.remove(inx);
|
|
|
|
} else {
|
2019-06-12 15:23:30 +00:00
|
|
|
let adapter = adapters_map
|
|
|
|
.get(name)
|
|
|
|
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
|
|
|
.clone();
|
|
|
|
if additive {
|
|
|
|
adapters.insert(0, adapter);
|
|
|
|
} else {
|
|
|
|
adapters.push(adapter);
|
|
|
|
}
|
2019-06-07 22:04:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
adapters
|
|
|
|
} else {
|
2019-06-12 10:39:27 +00:00
|
|
|
def_enabled_adapters
|
2019-06-07 22:04:48 +00:00
|
|
|
};
|
|
|
|
debug!(
|
2020-06-09 10:47:34 +00:00
|
|
|
"Chosen available adapters: {}",
|
2019-06-07 22:04:48 +00:00
|
|
|
adapters
|
|
|
|
.iter()
|
|
|
|
.map(|a| a.metadata().name.clone())
|
|
|
|
.collect::<Vec<String>>()
|
|
|
|
.join(",")
|
|
|
|
);
|
|
|
|
Ok(adapters)
|
|
|
|
}
|