2019-06-04 18:08:26 +00:00
|
|
|
pub mod ffmpeg;
|
|
|
|
pub mod pandoc;
|
|
|
|
pub mod poppler;
|
|
|
|
pub mod spawning;
|
2019-06-07 14:57:11 +00:00
|
|
|
pub mod sqlite;
|
2019-06-06 15:59:15 +00:00
|
|
|
pub mod tar;
|
2019-06-06 09:00:13 +00:00
|
|
|
pub mod zip;
|
2019-06-07 17:00:24 +00:00
|
|
|
use crate::preproc::PreprocConfig;
|
2019-06-06 09:00:13 +00:00
|
|
|
use failure::*;
|
2019-06-07 22:04:48 +00:00
|
|
|
use log::*;
|
2019-06-04 18:08:26 +00:00
|
|
|
use regex::{Regex, RegexSet};
|
2019-06-11 11:43:01 +00:00
|
|
|
|
2019-06-11 11:34:04 +00:00
|
|
|
use std::borrow::Cow;
|
2019-06-07 22:04:48 +00:00
|
|
|
use std::collections::HashMap;
|
2019-06-06 09:00:13 +00:00
|
|
|
use std::io::prelude::*;
|
2019-06-11 11:34:04 +00:00
|
|
|
use std::iter::Iterator;
|
2019-06-05 19:28:35 +00:00
|
|
|
use std::path::Path;
|
2019-06-04 18:08:26 +00:00
|
|
|
use std::rc::Rc;
|
|
|
|
|
2019-06-11 11:34:04 +00:00
|
|
|
/// Matcher that can be evaluated from the file name alone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FastMatcher {
    // MimeType(Regex),
    /// File extension without the leading dot, e.g. "jpg" or "tar.gz".
    /// Matched as /.*\.ext$/
    FileExtension(String),
    // todo: maybe add others, e.g. regex on whole filename or even paths
    // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
}
|
|
|
|
|
|
|
|
#[derive(Clone)]
|
|
|
|
pub enum SlowMatcher {
|
|
|
|
/// any type of fast matcher
|
|
|
|
Fast(FastMatcher),
|
|
|
|
///
|
|
|
|
/// match by exact mime type extracted using tree_magic
|
|
|
|
/// TODO: allow match ignoring suffix etc?
|
|
|
|
MimeType(String),
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
pub struct AdapterMeta {
|
2019-06-07 22:04:48 +00:00
|
|
|
/// unique short name of this adapter (a-z0-9 only)
|
2019-06-04 18:08:26 +00:00
|
|
|
pub name: String,
|
2019-06-07 22:04:48 +00:00
|
|
|
/// version identifier. used to key cache entries, change if your output format changes
|
2019-06-04 18:08:26 +00:00
|
|
|
pub version: i32,
|
2019-06-07 22:04:48 +00:00
|
|
|
pub description: String,
|
2019-06-11 11:34:04 +00:00
|
|
|
/// list of matchers (interpreted as ORed)
|
|
|
|
pub fast_matchers: Vec<FastMatcher>,
|
|
|
|
/// list of matchers when we have mime type detection active (interpreted as ORed)
|
|
|
|
/// warning: this *overrides* the fast matchers
|
|
|
|
pub slow_matchers: Option<Vec<SlowMatcher>>,
|
|
|
|
}
|
|
|
|
impl AdapterMeta {
|
|
|
|
// todo: this is pretty ugly
|
|
|
|
fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {
|
|
|
|
match (slow, &self.slow_matchers) {
|
|
|
|
(true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
|
|
|
|
(_, _) => Box::new(
|
|
|
|
self.fast_matchers
|
|
|
|
.iter()
|
|
|
|
.map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))),
|
|
|
|
),
|
|
|
|
}
|
|
|
|
}
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// The information about a file that adapters are matched against.
#[derive(Debug, Clone)]
pub struct FileMeta {
    /// filename is not actually a utf8 string, but since we can't do regex on OsStr and can't
    /// get a &[u8] from OsStr either, and since we probably only want to do matching on ascii
    /// stuff anyways, this is the filename as a string with non-valid bytes removed
    pub lossy_filename: String,
    /// only given when slow matching is enabled
    pub mimetype: Option<String>,
}
|
|
|
|
|
|
|
|
pub trait GetMetadata {
|
2019-06-06 21:43:30 +00:00
|
|
|
fn metadata(&self) -> &AdapterMeta;
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
pub trait FileAdapter: GetMetadata {
|
2019-06-06 09:00:13 +00:00
|
|
|
fn adapt(&self, a: AdaptInfo) -> Fallible<()>;
|
|
|
|
}
|
|
|
|
pub struct AdaptInfo<'a> {
|
2019-06-06 21:43:30 +00:00
|
|
|
/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
|
2019-06-06 09:00:13 +00:00
|
|
|
pub filepath_hint: &'a Path,
|
2019-06-06 21:43:30 +00:00
|
|
|
/// true if filepath_hint is an actual file on the file system
|
|
|
|
pub is_real_file: bool,
|
2019-06-07 13:43:19 +00:00
|
|
|
/// depth at which this file is in archives. 0 for real filesystem
|
|
|
|
pub archive_recursion_depth: i32,
|
2019-06-06 21:50:58 +00:00
|
|
|
/// stream to read the file from. can be from a file or from some decoder
|
2019-06-06 09:00:13 +00:00
|
|
|
pub inp: &'a mut dyn Read,
|
2019-06-06 21:50:58 +00:00
|
|
|
/// stream to write to. will be written to from a different thread
|
2019-06-06 09:00:13 +00:00
|
|
|
pub oup: &'a mut (dyn Write + Send),
|
2019-06-06 21:50:58 +00:00
|
|
|
/// prefix every output line with this string to better indicate the file's location if it is in some archive
|
2019-06-06 09:00:13 +00:00
|
|
|
pub line_prefix: &'a str,
|
|
|
|
// pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>,
|
2019-06-07 21:04:18 +00:00
|
|
|
pub config: PreprocConfig<'a>,
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
2019-06-05 19:28:35 +00:00
|
|
|
pub fn extension_to_regex(extension: &str) -> Regex {
|
|
|
|
Regex::new(&format!(".*\\.{}", ®ex::escape(extension))).expect("we know this regex compiles")
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
|
|
|
|
2019-06-05 19:28:35 +00:00
|
|
|
pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {
|
2019-06-11 12:40:58 +00:00
|
|
|
// order in descending priority
|
2019-06-04 18:08:26 +00:00
|
|
|
let adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
2019-06-06 21:43:30 +00:00
|
|
|
Rc::new(ffmpeg::FFmpegAdapter),
|
|
|
|
Rc::new(pandoc::PandocAdapter),
|
|
|
|
Rc::new(poppler::PopplerAdapter),
|
|
|
|
Rc::new(zip::ZipAdapter),
|
|
|
|
Rc::new(tar::TarAdapter),
|
2019-06-07 14:57:11 +00:00
|
|
|
Rc::new(sqlite::SqliteAdapter),
|
2019-06-04 18:08:26 +00:00
|
|
|
];
|
2019-06-05 19:28:35 +00:00
|
|
|
adapters
|
|
|
|
}
|
2019-06-04 18:08:26 +00:00
|
|
|
|
2019-06-11 11:34:04 +00:00
|
|
|
pub fn get_adapters_filtered<T: AsRef<str>>(
|
|
|
|
adapter_names: &[T],
|
|
|
|
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
2019-06-07 22:04:48 +00:00
|
|
|
let all_adapters = get_adapters();
|
|
|
|
let adapters = if !adapter_names.is_empty() {
|
|
|
|
let adapters_map: HashMap<_, _> = all_adapters
|
|
|
|
.iter()
|
|
|
|
.map(|e| (e.metadata().name.clone(), e.clone()))
|
|
|
|
.collect();
|
|
|
|
let mut adapters = vec![];
|
|
|
|
let mut subtractive = false;
|
|
|
|
for (i, name) in adapter_names.iter().enumerate() {
|
2019-06-11 11:34:04 +00:00
|
|
|
let mut name = name.as_ref();
|
|
|
|
if i == 0 && (name.starts_with('-')) {
|
2019-06-07 22:04:48 +00:00
|
|
|
subtractive = true;
|
|
|
|
name = &name[1..];
|
|
|
|
adapters = all_adapters.clone();
|
|
|
|
}
|
|
|
|
if subtractive {
|
|
|
|
let inx = adapters
|
|
|
|
.iter()
|
2019-06-11 11:34:04 +00:00
|
|
|
.position(|a| a.metadata().name == name)
|
2019-06-07 22:04:48 +00:00
|
|
|
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
|
|
|
adapters.remove(inx);
|
|
|
|
} else {
|
|
|
|
adapters.push(
|
|
|
|
adapters_map
|
|
|
|
.get(name)
|
|
|
|
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
|
|
|
.clone(),
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
adapters
|
|
|
|
} else {
|
|
|
|
all_adapters
|
|
|
|
};
|
|
|
|
debug!(
|
|
|
|
"Chosen adapters: {}",
|
|
|
|
adapters
|
|
|
|
.iter()
|
|
|
|
.map(|a| a.metadata().name.clone())
|
|
|
|
.collect::<Vec<String>>()
|
|
|
|
.join(",")
|
|
|
|
);
|
|
|
|
Ok(adapters)
|
|
|
|
}
|
2019-06-11 11:34:04 +00:00
|
|
|
|
|
|
|
pub fn adapter_matcher<T: AsRef<str>>(
|
|
|
|
adapter_names: &[T],
|
|
|
|
slow: bool,
|
2019-06-07 22:04:48 +00:00
|
|
|
) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {
|
|
|
|
let adapters = get_adapters_filtered(adapter_names)?;
|
2019-06-11 12:40:58 +00:00
|
|
|
// need order later
|
|
|
|
let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect();
|
2019-06-04 18:08:26 +00:00
|
|
|
let mut fname_regexes = vec![];
|
2019-06-11 11:34:04 +00:00
|
|
|
let mut mime_regexes = vec![];
|
2019-06-04 18:08:26 +00:00
|
|
|
for adapter in adapters.into_iter() {
|
|
|
|
let metadata = adapter.metadata();
|
2019-06-11 11:34:04 +00:00
|
|
|
use SlowMatcher::*;
|
|
|
|
for matcher in metadata.get_matchers(slow) {
|
|
|
|
match matcher.as_ref() {
|
|
|
|
MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
|
|
|
|
Fast(FastMatcher::FileExtension(re)) => {
|
2019-06-05 19:28:35 +00:00
|
|
|
fname_regexes.push((extension_to_regex(re), adapter.clone()))
|
|
|
|
}
|
2019-06-04 18:08:26 +00:00
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
|
2019-06-11 11:34:04 +00:00
|
|
|
let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
|
2019-06-06 21:43:30 +00:00
|
|
|
Ok(move |meta: FileMeta| {
|
2019-06-11 11:34:04 +00:00
|
|
|
let fname_matches: Vec<_> = fname_regex_set
|
|
|
|
.matches(&meta.lossy_filename)
|
|
|
|
.into_iter()
|
|
|
|
.collect();
|
|
|
|
let mime_matches: Vec<_> = if slow {
|
|
|
|
mime_regex_set
|
|
|
|
.matches(&meta.mimetype.expect("No mimetype?"))
|
|
|
|
.into_iter()
|
|
|
|
.collect()
|
|
|
|
} else {
|
|
|
|
vec![]
|
|
|
|
};
|
|
|
|
if fname_matches.len() + mime_matches.len() > 1 {
|
2019-06-11 12:40:58 +00:00
|
|
|
// get first according to original priority list...
|
|
|
|
let fa = fname_matches.iter().map(|e| fname_regexes[*e].1.clone());
|
|
|
|
let fb = mime_matches.iter().map(|e| mime_regexes[*e].1.clone());
|
|
|
|
let mut v = vec![];
|
|
|
|
v.extend(fa);
|
|
|
|
v.extend(fb);
|
|
|
|
v.sort_by_key(|e| {
|
|
|
|
(adapter_names
|
|
|
|
.iter()
|
|
|
|
.position(|r| r == &e.metadata().name)
|
|
|
|
.expect("impossib7"))
|
|
|
|
});
|
|
|
|
eprintln!(
|
|
|
|
"Warning: found multiple adapters for {}:",
|
|
|
|
meta.lossy_filename
|
|
|
|
);
|
|
|
|
for mmatch in v.iter() {
|
|
|
|
eprintln!(" - {}", mmatch.metadata().name);
|
2019-06-11 11:34:04 +00:00
|
|
|
}
|
2019-06-11 12:40:58 +00:00
|
|
|
return Some(v[0].clone());
|
2019-06-11 11:34:04 +00:00
|
|
|
}
|
2019-06-11 11:43:01 +00:00
|
|
|
if mime_matches.is_empty() {
|
|
|
|
if fname_matches.is_empty() {
|
2019-06-11 11:34:04 +00:00
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some(fname_regexes[fname_matches[0]].1.clone())
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
Some(mime_regexes[mime_matches[0]].1.clone())
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|
2019-06-06 21:43:30 +00:00
|
|
|
})
|
2019-06-04 18:08:26 +00:00
|
|
|
}
|