add slow matching (base)

phiresky 2019-06-11 13:34:04 +02:00
parent 9a036fdd4e
commit 0489a49d66
12 changed files with 137 additions and 63 deletions

View File

@@ -15,10 +15,11 @@ lazy_static! {
         name: "ffmpeg".to_owned(),
         version: 1,
         description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: None
     };
 }
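
For context: every adapter in this commit follows the same pattern, mapping a static EXTENSIONS list into fast_matchers and (except sqlite, below) setting slow_matchers: None. EXTENSIONS itself is outside the diff's context lines; a minimal sketch of its shape, with illustrative values that are not taken from this commit:

// Illustrative only: the shape of the per-adapter EXTENSIONS constant
// referenced above; the real ffmpeg list is not shown in this diff.
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];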

View File

@@ -9,19 +9,34 @@ use crate::preproc::PreprocConfig;
 use failure::*;
 use log::*;
 use regex::{Regex, RegexSet};
+use std::borrow::Borrow;
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io::prelude::*;
+use std::iter::Iterator;
 use std::path::Path;
 use std::rc::Rc;
 
-//pub use ffmpeg::FffmpegAdapter;
-
-pub enum Matcher {
+#[derive(Clone)]
+pub enum FastMatcher {
     // MimeType(Regex),
     /**
-     * without the dot. e.g. "jpg" or "tar.gz" matched as /.*\.ext$/
+     * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/
      *
      */
     FileExtension(String),
+    // todo: maybe add others, e.g. regex on whole filename or even paths
+    // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
+}
+
+#[derive(Clone)]
+pub enum SlowMatcher {
+    /// any type of fast matcher
+    Fast(FastMatcher),
+    ///
+    /// match by exact mime type extracted using tree_magic
+    /// TODO: allow match ignoring suffix etc?
+    MimeType(String),
 }
@@ -30,14 +45,32 @@ pub struct AdapterMeta {
     /// version identifier. used to key cache entries, change if your output format changes
     pub version: i32,
     pub description: String,
-    pub matchers: Vec<Matcher>,
+    /// list of matchers (interpreted as ORed)
+    pub fast_matchers: Vec<FastMatcher>,
+    /// list of matchers when we have mime type detection active (interpreted as ORed)
+    /// warning: this *overrides* the fast matchers
+    pub slow_matchers: Option<Vec<SlowMatcher>>,
+}
+
+impl AdapterMeta {
+    // todo: this is pretty ugly
+    fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {
+        match (slow, &self.slow_matchers) {
+            (true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
+            (_, _) => Box::new(
+                self.fast_matchers
+                    .iter()
+                    .map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))),
+            ),
+        }
+    }
 }
 
 pub struct FileMeta {
     // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
     // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed
     pub lossy_filename: String,
-    // pub mimetype: String,
+    // only given when slow matching is enabled
+    pub mimetype: Option<String>,
 }
 
 pub trait GetMetadata {
@@ -79,7 +112,9 @@ pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {
     adapters
 }
 
-pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
+pub fn get_adapters_filtered<T: AsRef<str>>(
+    adapter_names: &[T],
+) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
     let all_adapters = get_adapters();
     let adapters = if !adapter_names.is_empty() {
         let adapters_map: HashMap<_, _> = all_adapters
@@ -89,8 +124,8 @@ get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
         let mut adapters = vec![];
         let mut subtractive = false;
         for (i, name) in adapter_names.iter().enumerate() {
-            let mut name = &name[..];
-            if i == 0 && name.starts_with("-") {
+            let mut name = name.as_ref();
+            if i == 0 && (name.starts_with('-')) {
                 subtractive = true;
                 name = &name[1..];
                 adapters = all_adapters.clone();
@@ -98,7 +133,7 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
             if subtractive {
                 let inx = adapters
                     .iter()
-                    .position(|a| &a.metadata().name == name)
+                    .position(|a| a.metadata().name == name)
                     .ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
                 adapters.remove(inx);
             } else {
@@ -124,34 +159,58 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
     );
     Ok(adapters)
 }
-pub fn adapter_matcher(
-    adapter_names: &Vec<String>,
+
+pub fn adapter_matcher<T: AsRef<str>>(
+    adapter_names: &[T],
+    slow: bool,
 ) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {
     let adapters = get_adapters_filtered(adapter_names)?;
     let mut fname_regexes = vec![];
-    //let mut mime_regexes = vec![];
+    let mut mime_regexes = vec![];
     for adapter in adapters.into_iter() {
         let metadata = adapter.metadata();
-        for matcher in &metadata.matchers {
-            match matcher {
-                //Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
-                Matcher::FileExtension(re) => {
+        use SlowMatcher::*;
+        for matcher in metadata.get_matchers(slow) {
+            match matcher.as_ref() {
+                MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
+                Fast(FastMatcher::FileExtension(re)) => {
                     fname_regexes.push((extension_to_regex(re), adapter.clone()))
                 }
             };
         }
     }
     let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
-    //let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
+    let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
     Ok(move |meta: FileMeta| {
-        // todo: handle multiple conflicting matches
-        let matches = fname_regex_set.matches(&meta.lossy_filename);
-        match matches.iter().next() {
-            Some(m) => Some(fname_regexes[m].1.clone()),
-            None => None,
+        let fname_matches: Vec<_> = fname_regex_set
+            .matches(&meta.lossy_filename)
+            .into_iter()
+            .collect();
+        let mime_matches: Vec<_> = if slow {
+            mime_regex_set
+                .matches(&meta.mimetype.expect("No mimetype?"))
+                .into_iter()
+                .collect()
+        } else {
+            vec![]
+        };
+        if fname_matches.len() + mime_matches.len() > 1 {
+            eprintln!("Found multiple adapters for {}:", meta.lossy_filename);
+            for mmatch in mime_matches.iter() {
+                eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name);
+            }
+            for fmatch in fname_matches.iter() {
+                eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name);
+            }
+        }
+        if mime_matches.len() == 0 {
+            if fname_matches.len() == 0 {
+                None
+            } else {
+                Some(fname_regexes[fname_matches[0]].1.clone())
+            }
+        } else {
+            Some(mime_regexes[mime_matches[0]].1.clone())
         }
-        /*for m in mime_regex_set.matches(&meta.mimetype) {
-            return Some(mime_regexes[m].1.clone());
-        }*/
     })
 }
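
extension_to_regex is called in the loop above but not defined in this diff. Going by the FastMatcher doc comment (extensions are matched as /.*\.ext$/), a plausible sketch of it, not the actual implementation:

use regex::Regex;

// Sketch only: build the /.*\.ext$/ pattern described by the FastMatcher
// doc comment, escaping the extension so e.g. "tar.gz" matches literally.
fn extension_to_regex(extension: &str) -> Regex {
    Regex::new(&format!(".*\\.{}$", regex::escape(extension)))
        .expect("extension regex should always compile")
}

fn main() {
    assert!(extension_to_regex("tar.gz").is_match("backup.tar.gz"));
}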

View File

@@ -4,7 +4,7 @@ use spawning::SpawningFileAdapter;
 use std::process::Command;
 
 // from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs
-// excluding formats that could cause problems (db = sqlite) or that are already text formats (e.g. xml-based)
+// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based)
 //"db" -> Just "docbook"
 //"adoc" -> Just "asciidoc"
 //"asciidoc" -> Just "asciidoc"
@@ -46,10 +46,11 @@ lazy_static! {
         name: "pandoc".to_owned(),
         version: 1,
         description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: None
     };
 }
 
 #[derive(Default)]

View File

@@ -12,10 +12,11 @@ lazy_static! {
         version: 1,
         description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
             .to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: None
     };
 }
 
 #[derive(Default)]

View File

@@ -14,10 +14,13 @@ lazy_static! {
         description:
             "Uses sqlite bindings to convert sqlite databases into a simple plain text format"
                 .to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: Some(vec![SlowMatcher::MimeType(
+            "application/x-sqlite3".to_owned()
+        )])
     };
 }
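
The sqlite adapter is the first to register a slow matcher. The SlowMatcher docs above name tree_magic as the mime source; a minimal sketch of how a mimetype like "application/x-sqlite3" could be detected (the actual call site is not part of this commit):

// Sketch, assuming the tree_magic crate named in the SlowMatcher docs:
// detect a mime type from the first bytes of a file, as slow matching needs.
fn detect_mimetype(buf: &[u8]) -> String {
    tree_magic::from_u8(buf)
}

fn main() {
    // SQLite files begin with the magic string "SQLite format 3\0".
    let header = b"SQLite format 3\0";
    println!("{}", detect_mimetype(header)); // expected: application/x-sqlite3
}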

View File

@@ -13,10 +13,11 @@ lazy_static! {
         name: "tar".to_owned(),
         version: 1,
         description: "Reads a tar file as a stream and recurses down into its contents".to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: None
     };
 }
 
 #[derive(Default)]

View File

@@ -14,10 +14,11 @@ lazy_static! {
         name: "zip".to_owned(),
         version: 1,
         description: "Reads a zip file as a stream and recurses down into its contents".to_owned(),
-        matchers: EXTENSIONS
+        fast_matchers: EXTENSIONS
             .iter()
-            .map(|s| Matcher::FileExtension(s.to_string()))
+            .map(|s| FastMatcher::FileExtension(s.to_string()))
             .collect(),
+        slow_matchers: None
     };
 }
 
 #[derive(Default)]

View File

@@ -32,58 +32,65 @@ set_default!(max_archive_recursion, 4, i32);
 #[structopt(rename_all = "kebab-case", set_term_width = 80)]
 pub struct RgaArgs {
     #[serde(default, skip_serializing_if = "is_default")]
-    #[structopt(long, help = "Disable caching of results")]
-    pub rga_no_cache: bool,
+    #[structopt(long = "--rga-no-cache", help = "Disable caching of results")]
+    pub no_cache: bool,
 
     #[serde(default, skip_serializing_if = "is_default")]
     #[structopt(
-        long,
+        long = "--rga-accurate",
+        help = "Use more accurate but slower matching by mime type"
+    )]
+    pub accurate: bool,
+
+    #[serde(default, skip_serializing_if = "is_default")]
+    #[structopt(
+        long = "--rga-adapters",
         require_equals = true,
         require_delimiter = true,
         help = "Change which adapters to use and in which priority order (descending)"
     )]
-    pub rga_adapters: Vec<String>,
+    pub adapters: Vec<String>,
 
     #[serde(
         default = "def_cache_max_blob_len",
         skip_serializing_if = "def_cache_max_blob_len_if"
     )]
     #[structopt(
-        long,
+        long = "--rga-cache-max-blob-len",
         default_value = "2000000",
         help = "Max compressed size to cache",
         long_help = "Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time."
     )]
-    pub rga_cache_max_blob_len: u32,
+    pub cache_max_blob_len: u32,
 
     #[serde(
         default = "def_cache_compression_level",
         skip_serializing_if = "def_cache_compression_level_if"
     )]
     #[structopt(
-        long,
+        long = "--rga-cache-compression-level",
        default_value = "12",
         require_equals = true,
         help = "ZSTD compression level to apply to adapter outputs before storing in cache db"
     )]
-    pub rga_cache_compression_level: u32,
 
+    pub cache_compression_level: u32,
+
     #[serde(
         default = "def_max_archive_recursion",
         skip_serializing_if = "def_max_archive_recursion_if"
     )]
     #[structopt(
-        long,
+        long = "--rga-max-archive-recursion",
         default_value = "4",
         require_equals = true,
         help = "Maximum nestedness of archives to recurse into"
     )]
-    pub rga_max_archive_recursion: i32,
+    pub max_archive_recursion: i32,
 
     // these arguments stop the process, so don't serialize them
     #[serde(skip)]
-    #[structopt(long, help = "List all known adapters")]
-    pub rga_list_adapters: bool,
+    #[structopt(long = "--rga-list-adapters", help = "List all known adapters")]
+    pub list_adapters: bool,
 
     #[serde(skip)]
     #[structopt(long, help = "Show help for ripgrep itself")]
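
The field renames here (rga_no_cache to no_cache and so on) only work because every option now spells out its long flag: with rename_all = "kebab-case", structopt would otherwise derive --no-cache from the new field name and the CLI would silently change. A minimal illustration of the mechanism (not rga code):

use structopt::StructOpt;

#[derive(StructOpt)]
#[structopt(rename_all = "kebab-case")]
struct Demo {
    // With rename_all alone this field would surface as `--no-cache`;
    // the explicit long name keeps the user-facing flag stable.
    #[structopt(long = "--rga-no-cache")]
    no_cache: bool,
}

fn main() {
    let demo = Demo::from_args();
    println!("no_cache = {}", demo.no_cache);
}

Note also that --rga-adapters still accepts the subtractive form handled in get_adapters_filtered above, e.g. --rga-adapters=-ffmpeg to drop one adapter from the default set.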

View File

@@ -21,7 +21,7 @@ fn main() -> Fallible<()> {
     let i = File::open(&path)?;
     let mut o = std::io::stdout();
-    let cache = if args.rga_no_cache {
+    let cache = if args.no_cache {
         None
     } else {
         Some(rga::preproc_cache::open()?)

View File

@@ -62,17 +62,17 @@ fn main() -> Fallible<()> {
     env_logger::init();
     let (args, passthrough_args) = split_args()?;
-    let adapters = get_adapters_filtered(&args.rga_adapters)?;
-    if args.rga_list_adapters {
+    let adapters = get_adapters_filtered(&args.adapters)?;
+    if args.list_adapters {
         println!("Adapters:\n");
         for adapter in adapters {
             let meta = adapter.metadata();
             let matchers = meta
-                .matchers
+                .fast_matchers
                 .iter()
                 .map(|m| match m {
-                    Matcher::FileExtension(ext) => format!(".{}", ext),
+                    FastMatcher::FileExtension(ext) => format!(".{}", ext),
                 })
                 .collect::<Vec<_>>()
                 .join(", ");
@@ -87,9 +87,9 @@ fn main() -> Fallible<()> {
     let extensions = adapters
         .iter()
-        .flat_map(|a| &a.metadata().matchers)
+        .flat_map(|a| &a.metadata().fast_matchers)
         .filter_map(|m| match m {
-            Matcher::FileExtension(ext) => Some(ext as &str),
+            FastMatcher::FileExtension(ext) => Some(ext as &str),
         })
         .collect::<Vec<_>>()
         .join(",");

View File

@@ -47,7 +47,7 @@ impl<W: Write> Write for CachingWriter<W> {
             Some(writer) => {
                 let wrote = writer.write(buf)?;
                 let compressed_len = writer.get_ref().len();
-                //eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len);
+                trace!("wrote {} to zstd, len now {}", wrote, compressed_len);
                 if compressed_len > self.max_cache_size {
                     eprintln!("cache longer than max, dropping");
                     //writer.finish();

View File

@@ -20,7 +20,6 @@ pub struct PreprocConfig<'a> {
  *
  */
 pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
-    let adapters = adapter_matcher(&ai.config.args.rga_adapters)?;
     let AdaptInfo {
         filepath_hint,
         is_real_file,
@@ -32,11 +31,12 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
         ..
     } = ai;
     let PreprocConfig { mut cache, args } = config;
+    let adapters = adapter_matcher(&args.adapters[..], args.accurate)?;
     let filename = filepath_hint
         .file_name()
         .ok_or_else(|| format_err!("Empty filename"))?;
     eprintln!("depth: {}", archive_recursion_depth);
-    if archive_recursion_depth >= args.rga_max_archive_recursion {
+    if archive_recursion_depth >= args.max_archive_recursion {
         writeln!(oup, "{}[rga: max archive recursion reached]", line_prefix)?;
         return Ok(());
     }
@@ -49,7 +49,7 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
     )))?;
     println!("mimetype: {:?}", mimetype);*/
     let adapter = adapters(FileMeta {
-        // mimetype,
+        mimetype: None,
         lossy_filename: filename.to_string_lossy().to_string(),
     });
     match adapter {
@@ -77,8 +77,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
         // wrapping BufWriter here gives ~10% perf boost
         let mut compbuf = BufWriter::new(CachingWriter::new(
             oup,
-            args.rga_cache_max_blob_len.try_into().unwrap(),
-            args.rga_cache_compression_level.try_into().unwrap(),
+            args.cache_max_blob_len.try_into().unwrap(),
+            args.cache_compression_level.try_into().unwrap(),
         )?);
         eprintln!("adapting...");
         ad.adapt(AdaptInfo {
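
Note that the FileMeta passed to the matcher still hard-codes mimetype: None, so --rga-accurate has no effect at this call site yet; the commented-out detection code just above it hints at the plan. A sketch of what wiring it up might look like, assuming tree_magic (as named in the SlowMatcher docs) and a hypothetical prefix_buf holding the first bytes of the input:

// Sketch only: fill in the mimetype when --rga-accurate is set.
// `prefix_buf` is a hypothetical variable, not part of this commit.
let mimetype = if args.accurate {
    Some(tree_magic::from_u8(&prefix_buf))
} else {
    None
};
let adapter = adapters(FileMeta {
    mimetype,
    lossy_filename: filename.to_string_lossy().to_string(),
});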