ripgrep-all/src/adapters/mod.rs

pub mod ffmpeg;
pub mod pandoc;
pub mod poppler;
pub mod spawning;
pub mod sqlite;
pub mod tar;
pub mod zip;
use crate::preproc::PreprocConfig;
use failure::*;
use log::*;
use regex::{Regex, RegexSet};

use std::borrow::Cow;
use std::collections::HashMap;
use std::io::prelude::*;
use std::iter::Iterator;
use std::path::Path;
use std::rc::Rc;

#[derive(Clone)]
pub enum FastMatcher {
    // MimeType(Regex),
    /**
     * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/
     *
     */
    FileExtension(String),
    // todo: maybe add others, e.g. regex on whole filename or even paths
    // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
}

#[derive(Clone)]
pub enum SlowMatcher {
    /// any type of fast matcher
    Fast(FastMatcher),
    ///
    /// match by exact mime type extracted using tree_magic
    /// TODO: allow match ignoring suffix etc?
    MimeType(String),
}

pub struct AdapterMeta {
    /// unique short name of this adapter (a-z0-9 only)
    pub name: String,
    /// version identifier. used to key cache entries, change if your output format changes
    pub version: i32,
    pub description: String,
    /// list of matchers (interpreted as ORed)
    pub fast_matchers: Vec<FastMatcher>,
    /// list of matchers when we have mime type detection active (interpreted as ORed)
    /// warning: this *overrides* the fast matchers
    pub slow_matchers: Option<Vec<SlowMatcher>>,
}
impl AdapterMeta {
    // todo: this is pretty ugly
    fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {
        match (slow, &self.slow_matchers) {
            (true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
            (_, _) => Box::new(
                self.fast_matchers
                    .iter()
                    .map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))),
            ),
        }
    }
}

pub struct FileMeta {
    // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
    // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed
    pub lossy_filename: String,
    // only given when slow matching is enabled
    pub mimetype: Option<String>,
}

pub trait GetMetadata {
    fn metadata(&self) -> &AdapterMeta;
}
pub trait FileAdapter: GetMetadata {
    fn adapt(&self, a: AdaptInfo) -> Fallible<()>;
}
pub struct AdaptInfo<'a> {
    /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
    pub filepath_hint: &'a Path,
    /// true if filepath_hint is an actual file on the file system
    pub is_real_file: bool,
    /// depth at which this file is in archives. 0 for real filesystem
    pub archive_recursion_depth: i32,
    /// stream to read the file from. can be from a file or from some decoder
    pub inp: &'a mut dyn Read,
    /// stream to write to. will be written to from a different thread
    pub oup: &'a mut (dyn Write + Send),
    /// prefix every output line with this string to better indicate the file's location if it is in some archive
    pub line_prefix: &'a str,
    // pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>,
    pub config: PreprocConfig<'a>,
}

pub fn extension_to_regex(extension: &str) -> Regex {
    Regex::new(&format!(".*\\.{}", &regex::escape(extension))).expect("we know this regex compiles")
}

pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {
    let adapters: Vec<Rc<dyn FileAdapter>> = vec![
        Rc::new(ffmpeg::FFmpegAdapter),
        Rc::new(pandoc::PandocAdapter),
        Rc::new(poppler::PopplerAdapter),
        Rc::new(zip::ZipAdapter),
        Rc::new(tar::TarAdapter),
        Rc::new(sqlite::SqliteAdapter),
    ];
    adapters
}

pub fn get_adapters_filtered<T: AsRef<str>>(
    adapter_names: &[T],
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
    let all_adapters = get_adapters();
    let adapters = if !adapter_names.is_empty() {
        let adapters_map: HashMap<_, _> = all_adapters
            .iter()
            .map(|e| (e.metadata().name.clone(), e.clone()))
            .collect();
        let mut adapters = vec![];
        let mut subtractive = false;
        for (i, name) in adapter_names.iter().enumerate() {
            let mut name = name.as_ref();
            if i == 0 && (name.starts_with('-')) {
                subtractive = true;
                name = &name[1..];
                adapters = all_adapters.clone();
            }
            if subtractive {
                let inx = adapters
                    .iter()
                    .position(|a| a.metadata().name == name)
                    .ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
                adapters.remove(inx);
            } else {
                adapters.push(
                    adapters_map
                        .get(name)
                        .ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
                        .clone(),
                );
            }
        }
        adapters
    } else {
        all_adapters
    };
    debug!(
        "Chosen adapters: {}",
        adapters
            .iter()
            .map(|a| a.metadata().name.clone())
            .collect::<Vec<String>>()
            .join(",")
    );
    Ok(adapters)
}

pub fn adapter_matcher<T: AsRef<str>>(
    adapter_names: &[T],
    slow: bool,
) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {
    let adapters = get_adapters_filtered(adapter_names)?;
    let mut fname_regexes = vec![];
    let mut mime_regexes = vec![];
    for adapter in adapters.into_iter() {
        let metadata = adapter.metadata();
        use SlowMatcher::*;
        for matcher in metadata.get_matchers(slow) {
            match matcher.as_ref() {
                MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
                Fast(FastMatcher::FileExtension(re)) => {
                    fname_regexes.push((extension_to_regex(re), adapter.clone()))
                }
            };
        }
    }
    let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
    let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
    Ok(move |meta: FileMeta| {
        let fname_matches: Vec<_> = fname_regex_set
            .matches(&meta.lossy_filename)
            .into_iter()
            .collect();
        let mime_matches: Vec<_> = if slow {
            mime_regex_set
                .matches(&meta.mimetype.expect("No mimetype?"))
                .into_iter()
                .collect()
        } else {
            vec![]
        };
        if fname_matches.len() + mime_matches.len() > 1 {
            eprintln!("Found multiple adapters for {}:", meta.lossy_filename);
            for mmatch in mime_matches.iter() {
                eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name);
            }
            for fmatch in fname_matches.iter() {
                eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name);
            }
        }
        if mime_matches.is_empty() {
            if fname_matches.is_empty() {
                None
            } else {
                Some(fname_regexes[fname_matches[0]].1.clone())
            }
        } else {
            Some(mime_regexes[mime_matches[0]].1.clone())
        }
    })
}
initial working version 2019-06-04 18:08:26 +00:00			`pub mod ffmpeg;`
			`pub mod pandoc;`
			`pub mod poppler;`
			`pub mod spawning;`
add sqlite adapter 2019-06-07 14:57:11 +00:00			`pub mod sqlite;`
tar adapter (broken compression) 2019-06-06 15:59:15 +00:00			`pub mod tar;`
add zip support! 2019-06-06 09:00:13 +00:00			`pub mod zip;`
pass around config object 2019-06-07 17:00:24 +00:00			`use crate::preproc::PreprocConfig;`
add zip support! 2019-06-06 09:00:13 +00:00			`use failure::*;`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`use log::*;`
initial working version 2019-06-04 18:08:26 +00:00			`use regex::{Regex, RegexSet};`
split args 2019-06-11 11:43:01 +00:00
add slow matching (base) 2019-06-11 11:34:04 +00:00			`use std::borrow::Cow;`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`use std::collections::HashMap;`
add zip support! 2019-06-06 09:00:13 +00:00			`use std::io::prelude::*;`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`use std::iter::Iterator;`
use failure crate, etc 2019-06-05 19:28:35 +00:00			`use std::path::Path;`
initial working version 2019-06-04 18:08:26 +00:00			`use std::rc::Rc;`

add slow matching (base) 2019-06-11 11:34:04 +00:00			`#[derive(Clone)]`
			`pub enum FastMatcher {`
implement caching 2019-06-05 14:43:40 +00:00			`// MimeType(Regex),`
tar adapter (broken compression) 2019-06-06 15:59:15 +00:00			`/**`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`* without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/`
tar adapter (broken compression) 2019-06-06 15:59:15 +00:00			`*`
			`*/`
use failure crate, etc 2019-06-05 19:28:35 +00:00			`FileExtension(String),`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`// todo: maybe add others, e.g. regex on whole filename or even paths`
			`// todo: maybe allow matching a directory (e.g. /var/lib/postgres)`
			`}`

			`#[derive(Clone)]`
			`pub enum SlowMatcher {`
			`/// any type of fast matcher`
			`Fast(FastMatcher),`
			`///`
			`/// match by exact mime type extracted using tree_magic`
			`/// TODO: allow match ignoring suffix etc?`
			`MimeType(String),`
initial working version 2019-06-04 18:08:26 +00:00			`}`

			`pub struct AdapterMeta {`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`/// unique short name of this adapter (a-z0-9 only)`
initial working version 2019-06-04 18:08:26 +00:00			`pub name: String,`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`/// version identifier. used to key cache entries, change if your output format changes`
initial working version 2019-06-04 18:08:26 +00:00			`pub version: i32,`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`pub description: String,`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`/// list of matchers (interpreted as ORed)`
			`pub fast_matchers: Vec<FastMatcher>,`
			`/// list of matchers when we have mime type detection active (interpreted as ORed)`
			`/// warning: this overrides the fast matchers`
			`pub slow_matchers: Option<Vec<SlowMatcher>>,`
			`}`
			`impl AdapterMeta {`
			`// todo: this is pretty ugly`
			`fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {`
			`match (slow, &self.slow_matchers) {`
			`(true, Some(ref sm)) => Box::new(sm.iter().map(\|e\| Cow::Borrowed(e))),`
			`(_, _) => Box::new(`
			`self.fast_matchers`
			`.iter()`
			`.map(\|e\| Cow::Owned(SlowMatcher::Fast(e.clone()))),`
			`),`
			`}`
			`}`
initial working version 2019-06-04 18:08:26 +00:00			`}`

			`pub struct FileMeta {`
			`// filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,`
add zip support! 2019-06-06 09:00:13 +00:00			`// and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed`
initial working version 2019-06-04 18:08:26 +00:00			`pub lossy_filename: String,`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`// only given when slow matching is enabled`
			`pub mimetype: Option<String>,`
initial working version 2019-06-04 18:08:26 +00:00			`}`

			`pub trait GetMetadata {`
cleanup 2019-06-06 21:43:30 +00:00			`fn metadata(&self) -> &AdapterMeta;`
initial working version 2019-06-04 18:08:26 +00:00			`}`
			`pub trait FileAdapter: GetMetadata {`
add zip support! 2019-06-06 09:00:13 +00:00			`fn adapt(&self, a: AdaptInfo) -> Fallible<()>;`
			`}`
			`pub struct AdaptInfo<'a> {`
cleanup 2019-06-06 21:43:30 +00:00			`/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.`
add zip support! 2019-06-06 09:00:13 +00:00			`pub filepath_hint: &'a Path,`
cleanup 2019-06-06 21:43:30 +00:00			`/// true if filepath_hint is an actual file on the file system`
			`pub is_real_file: bool,`
track recursion depth 2019-06-07 13:43:19 +00:00			`/// depth at which this file is in archives. 0 for real filesystem`
			`pub archive_recursion_depth: i32,`
docu 2019-06-06 21:50:58 +00:00			`/// stream to read the file from. can be from a file or from some decoder`
add zip support! 2019-06-06 09:00:13 +00:00			`pub inp: &'a mut dyn Read,`
docu 2019-06-06 21:50:58 +00:00			`/// stream to write to. will be written to from a different thread`
add zip support! 2019-06-06 09:00:13 +00:00			`pub oup: &'a mut (dyn Write + Send),`
docu 2019-06-06 21:50:58 +00:00			`/// prefix every output line with this string to better indicate the file's location if it is in some archive`
add zip support! 2019-06-06 09:00:13 +00:00			`pub line_prefix: &'a str,`
			`// pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>,`
more options, less constants 2019-06-07 21:04:18 +00:00			`pub config: PreprocConfig<'a>,`
initial working version 2019-06-04 18:08:26 +00:00			`}`

use failure crate, etc 2019-06-05 19:28:35 +00:00			`pub fn extension_to_regex(extension: &str) -> Regex {`
			`Regex::new(&format!(".*\\.{}", &regex::escape(extension))).expect("we know this regex compiles")`
initial working version 2019-06-04 18:08:26 +00:00			`}`

use failure crate, etc 2019-06-05 19:28:35 +00:00			`pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {`
initial working version 2019-06-04 18:08:26 +00:00			`let adapters: Vec<Rc<dyn FileAdapter>> = vec![`
cleanup 2019-06-06 21:43:30 +00:00			`Rc::new(ffmpeg::FFmpegAdapter),`
			`Rc::new(pandoc::PandocAdapter),`
			`Rc::new(poppler::PopplerAdapter),`
			`Rc::new(zip::ZipAdapter),`
			`Rc::new(tar::TarAdapter),`
add sqlite adapter 2019-06-07 14:57:11 +00:00			`Rc::new(sqlite::SqliteAdapter),`
initial working version 2019-06-04 18:08:26 +00:00			`];`
use failure crate, etc 2019-06-05 19:28:35 +00:00			`adapters`
			`}`
initial working version 2019-06-04 18:08:26 +00:00
add slow matching (base) 2019-06-11 11:34:04 +00:00			`pub fn get_adapters_filtered<T: AsRef<str>>(`
			`adapter_names: &[T],`
			`) -> Fallible<Vec<Rc<dyn FileAdapter>>> {`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`let all_adapters = get_adapters();`
			`let adapters = if !adapter_names.is_empty() {`
			`let adapters_map: HashMap<_, _> = all_adapters`
			`.iter()`
			`.map(\|e\| (e.metadata().name.clone(), e.clone()))`
			`.collect();`
			`let mut adapters = vec![];`
			`let mut subtractive = false;`
			`for (i, name) in adapter_names.iter().enumerate() {`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`let mut name = name.as_ref();`
			`if i == 0 && (name.starts_with('-')) {`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`subtractive = true;`
			`name = &name[1..];`
			`adapters = all_adapters.clone();`
			`}`
			`if subtractive {`
			`let inx = adapters`
			`.iter()`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`.position(\|a\| a.metadata().name == name)`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`.ok_or_else(\|\| format_err!("Could not remove {}: Not in list", name))?;`
			`adapters.remove(inx);`
			`} else {`
			`adapters.push(`
			`adapters_map`
			`.get(name)`
			`.ok_or_else(\|\| format_err!("Unknown adapter: \"{}\"", name))?`
			`.clone(),`
			`);`
			`}`
			`}`
			`adapters`
			`} else {`
			`all_adapters`
			`};`
			`debug!(`
			`"Chosen adapters: {}",`
			`adapters`
			`.iter()`
			`.map(\|a\| a.metadata().name.clone())`
			`.collect::<Vec<String>>()`
			`.join(",")`
			`);`
			`Ok(adapters)`
			`}`
add slow matching (base) 2019-06-11 11:34:04 +00:00
			`pub fn adapter_matcher<T: AsRef<str>>(`
			`adapter_names: &[T],`
			`slow: bool,`
actually implement choosing adapters 2019-06-07 22:04:48 +00:00			`) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {`
			`let adapters = get_adapters_filtered(adapter_names)?;`
initial working version 2019-06-04 18:08:26 +00:00			`let mut fname_regexes = vec![];`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`let mut mime_regexes = vec![];`
initial working version 2019-06-04 18:08:26 +00:00			`for adapter in adapters.into_iter() {`
			`let metadata = adapter.metadata();`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`use SlowMatcher::*;`
			`for matcher in metadata.get_matchers(slow) {`
			`match matcher.as_ref() {`
			`MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),`
			`Fast(FastMatcher::FileExtension(re)) => {`
use failure crate, etc 2019-06-05 19:28:35 +00:00			`fname_regexes.push((extension_to_regex(re), adapter.clone()))`
			`}`
initial working version 2019-06-04 18:08:26 +00:00			`};`
			`}`
			`}`
			`let fname_regex_set = RegexSet::new(fname_regexes.iter().map(\|p\| p.0.as_str()))?;`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`let mime_regex_set = RegexSet::new(mime_regexes.iter().map(\|p\| p.0.as_str()))?;`
cleanup 2019-06-06 21:43:30 +00:00			`Ok(move \|meta: FileMeta\| {`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`let fname_matches: Vec<_> = fname_regex_set`
			`.matches(&meta.lossy_filename)`
			`.into_iter()`
			`.collect();`
			`let mime_matches: Vec<_> = if slow {`
			`mime_regex_set`
			`.matches(&meta.mimetype.expect("No mimetype?"))`
			`.into_iter()`
			`.collect()`
			`} else {`
			`vec![]`
			`};`
			`if fname_matches.len() + mime_matches.len() > 1 {`
			`eprintln!("Found multiple adapters for {}:", meta.lossy_filename);`
			`for mmatch in mime_matches.iter() {`
			`eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name);`
			`}`
			`for fmatch in fname_matches.iter() {`
			`eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name);`
			`}`
			`}`
split args 2019-06-11 11:43:01 +00:00			`if mime_matches.is_empty() {`
			`if fname_matches.is_empty() {`
add slow matching (base) 2019-06-11 11:34:04 +00:00			`None`
			`} else {`
			`Some(fname_regexes[fname_matches[0]].1.clone())`
			`}`
			`} else {`
			`Some(mime_regexes[mime_matches[0]].1.clone())`
initial working version 2019-06-04 18:08:26 +00:00			`}`
cleanup 2019-06-06 21:43:30 +00:00			`})`
initial working version 2019-06-04 18:08:26 +00:00			`}`