add slow matching (base)

phiresky 2019-06-11 13:34:04 +02:00
parent 9a036fdd4e
commit 0489a49d66
12 changed files with 137 additions and 63 deletions

View File

@@ -15,10 +15,11 @@ lazy_static! {
name: "ffmpeg".to_owned(),
version: 1,
description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}

View File

@@ -9,19 +9,34 @@ use crate::preproc::PreprocConfig;
use failure::*;
use log::*;
use regex::{Regex, RegexSet};
use std::borrow::Borrow;
use std::borrow::Cow;
use std::collections::HashMap;
use std::io::prelude::*;
use std::iter::Iterator;
use std::path::Path;
use std::rc::Rc;
//pub use ffmpeg::FffmpegAdapter;
pub enum Matcher {
#[derive(Clone)]
pub enum FastMatcher {
// MimeType(Regex),
/**
* without the dot. e.g. "jpg" or "tar.gz" matched as /.*\.ext$/
* without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/
*
*/
FileExtension(String),
// todo: maybe add others, e.g. regex on whole filename or even paths
// todo: maybe allow matching a directory (e.g. /var/lib/postgres)
}
#[derive(Clone)]
pub enum SlowMatcher {
/// any type of fast matcher
Fast(FastMatcher),
///
/// match by exact mime type extracted using tree_magic
/// TODO: allow match ignoring suffix etc?
MimeType(String),
}
pub struct AdapterMeta {
@@ -30,14 +45,32 @@ pub struct AdapterMeta {
/// version identifier. used to key cache entries, change if your output format changes
pub version: i32,
pub description: String,
pub matchers: Vec<Matcher>,
/// list of matchers (interpreted as ORed)
pub fast_matchers: Vec<FastMatcher>,
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
pub slow_matchers: Option<Vec<SlowMatcher>>,
}
impl AdapterMeta {
// todo: this is pretty ugly
fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {
match (slow, &self.slow_matchers) {
(true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
(_, _) => Box::new(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))),
),
}
}
}
pub struct FileMeta {
// filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
// and since we probably only want to match on ascii stuff anyway, this is the filename as a string with non-valid bytes removed
pub lossy_filename: String,
// pub mimetype: String,
// only given when slow matching is enabled
pub mimetype: Option<String>,
}
pub trait GetMetadata {
@@ -79,7 +112,9 @@ pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {
adapters
}
pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
pub fn get_adapters_filtered<T: AsRef<str>>(
adapter_names: &[T],
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
let all_adapters = get_adapters();
let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = all_adapters
@@ -89,8 +124,8 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
let mut adapters = vec![];
let mut subtractive = false;
for (i, name) in adapter_names.iter().enumerate() {
let mut name = &name[..];
if i == 0 && name.starts_with("-") {
let mut name = name.as_ref();
if i == 0 && (name.starts_with('-')) {
subtractive = true;
name = &name[1..];
adapters = all_adapters.clone();
@@ -98,7 +133,7 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
if subtractive {
let inx = adapters
.iter()
.position(|a| &a.metadata().name == name)
.position(|a| a.metadata().name == name)
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
adapters.remove(inx);
} else {
@@ -124,34 +159,58 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
);
Ok(adapters)
}
pub fn adapter_matcher(
adapter_names: &Vec<String>,
pub fn adapter_matcher<T: AsRef<str>>(
adapter_names: &[T],
slow: bool,
) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {
let adapters = get_adapters_filtered(adapter_names)?;
let mut fname_regexes = vec![];
//let mut mime_regexes = vec![];
let mut mime_regexes = vec![];
for adapter in adapters.into_iter() {
let metadata = adapter.metadata();
for matcher in &metadata.matchers {
match matcher {
//Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
Matcher::FileExtension(re) => {
use SlowMatcher::*;
for matcher in metadata.get_matchers(slow) {
match matcher.as_ref() {
MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
Fast(FastMatcher::FileExtension(re)) => {
fname_regexes.push((extension_to_regex(re), adapter.clone()))
}
};
}
}
let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
//let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
Ok(move |meta: FileMeta| {
// todo: handle multiple conflicting matches
let matches = fname_regex_set.matches(&meta.lossy_filename);
match matches.iter().next() {
Some(m) => Some(fname_regexes[m].1.clone()),
None => None,
let fname_matches: Vec<_> = fname_regex_set
.matches(&meta.lossy_filename)
.into_iter()
.collect();
let mime_matches: Vec<_> = if slow {
mime_regex_set
.matches(&meta.mimetype.expect("No mimetype?"))
.into_iter()
.collect()
} else {
vec![]
};
if fname_matches.len() + mime_matches.len() > 1 {
eprintln!("Found multiple adapters for {}:", meta.lossy_filename);
for mmatch in mime_matches.iter() {
eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name);
}
for fmatch in fname_matches.iter() {
eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name);
}
}
if mime_matches.len() == 0 {
if fname_matches.len() == 0 {
None
} else {
Some(fname_regexes[fname_matches[0]].1.clone())
}
} else {
Some(mime_regexes[mime_matches[0]].1.clone())
}
/*for m in mime_regex_set.matches(&meta.mimetype) {
return Some(mime_regexes[m].1.clone());
}*/
})
}
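A note on the matcher resolution above: get_matchers hands back Cow<SlowMatcher> so that stored slow matchers are borrowed as-is while fast matchers are wrapped into SlowMatcher::Fast on the fly, and all filename patterns end up in a single RegexSet pass. Below is a minimal self-contained sketch of that pattern; the body of extension_to_regex is an assumption (only its call site appears in this diff):

use regex::{Regex, RegexSet};
use std::borrow::Cow;

#[derive(Clone, Debug)]
enum FastMatcher {
    FileExtension(String),
}

#[derive(Clone, Debug)]
enum SlowMatcher {
    Fast(FastMatcher),
    MimeType(String),
}

// assumed shape: match the escaped extension at the end of the filename
fn extension_to_regex(ext: &str) -> Regex {
    Regex::new(&format!(".*\\.{}$", regex::escape(ext))).expect("valid regex")
}

fn main() {
    let fast_matchers = vec![FastMatcher::FileExtension("tar.gz".to_owned())];
    // promote fast matchers without cloning any stored slow matchers
    let unified: Vec<Cow<SlowMatcher>> = fast_matchers
        .iter()
        .map(|e| Cow::Owned(SlowMatcher::Fast(e.clone())))
        .collect();
    // collect filename patterns into one RegexSet so every pattern is tested in a single scan
    let fname_patterns: Vec<String> = unified
        .iter()
        .filter_map(|m| match m.as_ref() {
            SlowMatcher::Fast(FastMatcher::FileExtension(ext)) => {
                Some(extension_to_regex(ext).as_str().to_owned())
            }
            SlowMatcher::MimeType(_) => None,
        })
        .collect();
    let set = RegexSet::new(&fname_patterns).unwrap();
    assert!(set.is_match("logs.tar.gz"));
    assert!(!set.is_match("notes.txt"));
}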

View File

@@ -4,7 +4,7 @@ use spawning::SpawningFileAdapter;
use std::process::Command;
// from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs
// excluding formats that could cause problems (db = sqlite) or that are already text formats (e.g. xml-based)
// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based)
//"db" -> Just "docbook"
//"adoc" -> Just "asciidoc"
//"asciidoc" -> Just "asciidoc"
@@ -46,10 +46,11 @@ lazy_static! {
name: "pandoc".to_owned(),
version: 1,
description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}
#[derive(Default)]

View File

@@ -12,10 +12,11 @@ lazy_static! {
version: 1,
description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
.to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}
#[derive(Default)]

View File

@@ -14,10 +14,13 @@ lazy_static! {
description:
"Uses sqlite bindings to convert sqlite databases into a simple plain text format"
.to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: Some(vec![SlowMatcher::MimeType(
"application/x-sqlite3".to_owned()
)])
};
}
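The sqlite adapter is the first to register a slow matcher. Under --rga-accurate, the idea is to detect the mime type from file content (the matcher doc earlier in this diff names tree_magic) and compare it against application/x-sqlite3. A hedged sketch of how such detection could fill FileMeta::mimetype; the prefix length is an illustrative choice, not taken from this commit:

use std::fs::File;
use std::io::Read;

fn detect_mimetype(path: &str) -> std::io::Result<String> {
    // read a small prefix; magic numbers live at the start of the file
    let mut buf = vec![0u8; 1024];
    let n = File::open(path)?.read(&mut buf)?;
    buf.truncate(n);
    // tree_magic matches the byte prefix against its mime database
    Ok(tree_magic::from_u8(&buf))
}

fn main() -> std::io::Result<()> {
    let mime = detect_mimetype("test.db")?;
    // sqlite databases should report as application/x-sqlite3
    println!("detected: {}", mime);
    Ok(())
}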

View File

@@ -13,10 +13,11 @@ lazy_static! {
name: "tar".to_owned(),
version: 1,
description: "Reads a tar file as a stream and recurses down into its contents".to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}
#[derive(Default)]

View File

@@ -14,10 +14,11 @@ lazy_static! {
name: "zip".to_owned(),
version: 1,
description: "Reads a zip file as a stream and recurses down into its contents".to_owned(),
matchers: EXTENSIONS
fast_matchers: EXTENSIONS
.iter()
.map(|s| Matcher::FileExtension(s.to_string()))
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}
#[derive(Default)]

View File

@@ -32,58 +32,65 @@ set_default!(max_archive_recursion, 4, i32);
#[structopt(rename_all = "kebab-case", set_term_width = 80)]
pub struct RgaArgs {
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(long, help = "Disable caching of results")]
pub rga_no_cache: bool,
#[structopt(long = "--rga-no-cache", help = "Disable caching of results")]
pub no_cache: bool,
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(
long,
long = "--rga-accurate",
help = "Use more accurate but slower matching by mime type"
)]
pub accurate: bool,
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(
long = "--rga-adapters",
require_equals = true,
require_delimiter = true,
help = "Change which adapters to use and in which priority order (descending)"
)]
pub rga_adapters: Vec<String>,
pub adapters: Vec<String>,
#[serde(
default = "def_cache_max_blob_len",
skip_serializing_if = "def_cache_max_blob_len_if"
)]
#[structopt(
long,
long = "--rga-cache-max-blob-len",
default_value = "2000000",
help = "Max compressed size to cache",
long_help = "Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time."
)]
pub rga_cache_max_blob_len: u32,
pub cache_max_blob_len: u32,
#[serde(
default = "def_cache_compression_level",
skip_serializing_if = "def_cache_compression_level_if"
)]
#[structopt(
long,
long = "--rga-cache-compression-level",
default_value = "12",
require_equals = true,
help = "ZSTD compression level to apply to adapter outputs before storing in cache db"
)]
pub rga_cache_compression_level: u32,
pub cache_compression_level: u32,
#[serde(
default = "def_max_archive_recursion",
skip_serializing_if = "def_max_archive_recursion_if"
)]
#[structopt(
long,
long = "--rga-max-archive-recursion",
default_value = "4",
require_equals = true,
help = "Maximum nestedness of archives to recurse into"
)]
pub rga_max_archive_recursion: i32,
pub max_archive_recursion: i32,
// these arguments stop the process, so don't serialize them
#[serde(skip)]
#[structopt(long, help = "List all known adapters")]
pub rga_list_adapters: bool,
#[structopt(long = "--rga-list-adapters", help = "List all known adapters")]
pub list_adapters: bool,
#[serde(skip)]
#[structopt(long, help = "Show help for ripgrep itself")]

View File

@@ -21,7 +21,7 @@ fn main() -> Fallible<()> {
let i = File::open(&path)?;
let mut o = std::io::stdout();
let cache = if args.rga_no_cache {
let cache = if args.no_cache {
None
} else {
Some(rga::preproc_cache::open()?)

View File

@@ -62,17 +62,17 @@ fn main() -> Fallible<()> {
env_logger::init();
let (args, passthrough_args) = split_args()?;
let adapters = get_adapters_filtered(&args.rga_adapters)?;
let adapters = get_adapters_filtered(&args.adapters)?;
if args.rga_list_adapters {
if args.list_adapters {
println!("Adapters:\n");
for adapter in adapters {
let meta = adapter.metadata();
let matchers = meta
.matchers
.fast_matchers
.iter()
.map(|m| match m {
Matcher::FileExtension(ext) => format!(".{}", ext),
FastMatcher::FileExtension(ext) => format!(".{}", ext),
})
.collect::<Vec<_>>()
.join(", ");
@@ -87,9 +87,9 @@ fn main() -> Fallible<()> {
let extensions = adapters
.iter()
.flat_map(|a| &a.metadata().matchers)
.flat_map(|a| &a.metadata().fast_matchers)
.filter_map(|m| match m {
Matcher::FileExtension(ext) => Some(ext as &str),
FastMatcher::FileExtension(ext) => Some(ext as &str),
})
.collect::<Vec<_>>()
.join(",");

View File

@@ -47,7 +47,7 @@ impl<W: Write> Write for CachingWriter<W> {
Some(writer) => {
let wrote = writer.write(buf)?;
let compressed_len = writer.get_ref().len();
//eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len);
trace!("wrote {} to zstd, len now {}", wrote, compressed_len);
if compressed_len > self.max_cache_size {
eprintln!("cache longer than max, dropping");
//writer.finish();
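For context on this change: CachingWriter tees adapter output into an in-memory zstd encoder and abandons the cached copy once the compressed length crosses max_cache_size, while the pass-through write continues. A standalone sketch of that size check, assuming a Vec<u8>-backed encoder as writer.get_ref().len() implies:

use std::io::Write;

fn main() -> std::io::Result<()> {
    let max_cache_size: usize = 1_000_000;
    // compress into an in-memory buffer; level 12 matches the cache default above
    let mut cache = Some(zstd::stream::write::Encoder::new(Vec::new(), 12)?);
    // hypothetical chunk of adapter output
    let buf = vec![b'x'; 4096];
    let mut over_limit = false;
    if let Some(writer) = cache.as_mut() {
        let wrote = writer.write(&buf)?;
        let compressed_len = writer.get_ref().len();
        eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len);
        over_limit = compressed_len > max_cache_size;
    }
    if over_limit {
        // cache longer than max: drop the encoder, keep only the real output
        cache = None;
    }
    assert!(cache.is_some());
    Ok(())
}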

View File

@@ -20,7 +20,6 @@ pub struct PreprocConfig<'a> {
*
*/
pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
let adapters = adapter_matcher(&ai.config.args.rga_adapters)?;
let AdaptInfo {
filepath_hint,
is_real_file,
@@ -32,11 +31,12 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
..
} = ai;
let PreprocConfig { mut cache, args } = config;
let adapters = adapter_matcher(&args.adapters[..], args.accurate)?;
let filename = filepath_hint
.file_name()
.ok_or_else(|| format_err!("Empty filename"))?;
eprintln!("depth: {}", archive_recursion_depth);
if archive_recursion_depth >= args.rga_max_archive_recursion {
if archive_recursion_depth >= args.max_archive_recursion {
writeln!(oup, "{}[rga: max archive recursion reached]", line_prefix)?;
return Ok(());
}
@@ -49,7 +49,7 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
)))?;
println!("mimetype: {:?}", mimetype);*/
let adapter = adapters(FileMeta {
// mimetype,
mimetype: None,
lossy_filename: filename.to_string_lossy().to_string(),
});
match adapter {
@@ -77,8 +77,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
// wrapping BufWriter here gives ~10% perf boost
let mut compbuf = BufWriter::new(CachingWriter::new(
oup,
args.rga_cache_max_blob_len.try_into().unwrap(),
args.rga_cache_compression_level.try_into().unwrap(),
args.cache_max_blob_len.try_into().unwrap(),
args.cache_compression_level.try_into().unwrap(),
)?);
eprintln!("adapting...");
ad.adapt(AdaptInfo {
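The diff ends mid-call here, but the flow it establishes is: build the matcher closure per invocation with the accuracy flag, then query it with a FileMeta whose mimetype is only populated in slow mode. A hedged, heavily simplified sketch of that call shape (adapter names stand in for the real Rc<dyn FileAdapter> values):

// simplified stand-ins for the diff's types, for illustration only
struct FileMeta {
    lossy_filename: String,
    mimetype: Option<String>,
}

fn main() {
    let accurate = false;
    // stand-in for the closure returned by adapter_matcher(&args.adapters[..], args.accurate);
    // the real one returns Option<Rc<dyn FileAdapter>> instead of a name
    let adapters = |meta: FileMeta| -> Option<&'static str> {
        if meta.mimetype.as_deref() == Some("application/x-sqlite3") {
            Some("sqlite")
        } else if meta.lossy_filename.ends_with(".pdf") {
            Some("poppler")
        } else {
            None
        }
    };
    let adapter = adapters(FileMeta {
        // mimetype stays None unless --rga-accurate enables slow matching
        mimetype: if accurate {
            Some("application/pdf".to_owned())
        } else {
            None
        },
        lossy_filename: "report.pdf".to_owned(),
    });
    assert_eq!(adapter, Some("poppler"));
}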