mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
add slow matching (base)
This commit is contained in:
parent
9a036fdd4e
commit
0489a49d66
@ -15,10 +15,11 @@ lazy_static! {
|
||||
name: "ffmpeg".to_owned(),
|
||||
version: 1,
|
||||
description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -9,19 +9,34 @@ use crate::preproc::PreprocConfig;
|
||||
use failure::*;
|
||||
use log::*;
|
||||
use regex::{Regex, RegexSet};
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::io::prelude::*;
|
||||
use std::iter::Iterator;
|
||||
use std::path::Path;
|
||||
use std::rc::Rc;
|
||||
//pub use ffmpeg::FffmpegAdapter;
|
||||
|
||||
/// Matcher that can be evaluated from the file name alone.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum FastMatcher {
    /// File extension without the leading dot, e.g. "jpg" or "tar.gz".
    /// Matched as /.*\.ext$/
    FileExtension(String),
    // todo: maybe add others, e.g. regex on whole filename or even paths
    // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
}

/// Matcher that may additionally look at the detected mime type of a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SlowMatcher {
    /// any type of fast matcher
    Fast(FastMatcher),
    /// match by exact mime type extracted using tree_magic
    /// TODO: allow match ignoring suffix etc?
    MimeType(String),
}
|
||||
|
||||
pub struct AdapterMeta {
|
||||
@ -30,14 +45,32 @@ pub struct AdapterMeta {
|
||||
/// version identifier. used to key cache entries, change if your output format changes
|
||||
pub version: i32,
|
||||
pub description: String,
|
||||
pub matchers: Vec<Matcher>,
|
||||
/// list of matchers (interpreted as ORed)
|
||||
pub fast_matchers: Vec<FastMatcher>,
|
||||
/// list of matchers when we have mime type detection active (interpreted as ORed)
|
||||
/// warning: this *overrides* the fast matchers
|
||||
pub slow_matchers: Option<Vec<SlowMatcher>>,
|
||||
}
|
||||
impl AdapterMeta {
|
||||
// todo: this is pretty ugly
|
||||
fn get_matchers<'a>(&'a self, slow: bool) -> Box<dyn Iterator<Item = Cow<SlowMatcher>> + 'a> {
|
||||
match (slow, &self.slow_matchers) {
|
||||
(true, Some(ref sm)) => Box::new(sm.iter().map(|e| Cow::Borrowed(e))),
|
||||
(_, _) => Box::new(
|
||||
self.fast_matchers
|
||||
.iter()
|
||||
.map(|e| Cow::Owned(SlowMatcher::Fast(e.clone()))),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Properties of a file that adapter matchers are evaluated against.
#[derive(Clone, Debug)]
pub struct FileMeta {
    /// The file name as a string.
    ///
    /// A filename is not actually a utf8 string, but since we can't do regex on OsStr and
    /// can't get a &[u8] from OsStr either, and since we probably only want to match on
    /// ascii stuff anyways, this is the filename converted lossily (non-valid bytes replaced).
    pub lossy_filename: String,
    /// only given when slow matching is enabled
    pub mimetype: Option<String>,
}
|
||||
|
||||
pub trait GetMetadata {
|
||||
@ -79,7 +112,9 @@ pub fn get_adapters() -> Vec<Rc<dyn FileAdapter>> {
|
||||
adapters
|
||||
}
|
||||
|
||||
pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
||||
pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||
adapter_names: &[T],
|
||||
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
||||
let all_adapters = get_adapters();
|
||||
let adapters = if !adapter_names.is_empty() {
|
||||
let adapters_map: HashMap<_, _> = all_adapters
|
||||
@ -89,8 +124,8 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
|
||||
let mut adapters = vec![];
|
||||
let mut subtractive = false;
|
||||
for (i, name) in adapter_names.iter().enumerate() {
|
||||
let mut name = &name[..];
|
||||
if i == 0 && name.starts_with("-") {
|
||||
let mut name = name.as_ref();
|
||||
if i == 0 && (name.starts_with('-')) {
|
||||
subtractive = true;
|
||||
name = &name[1..];
|
||||
adapters = all_adapters.clone();
|
||||
@ -98,7 +133,7 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
|
||||
if subtractive {
|
||||
let inx = adapters
|
||||
.iter()
|
||||
.position(|a| &a.metadata().name == name)
|
||||
.position(|a| a.metadata().name == name)
|
||||
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
||||
adapters.remove(inx);
|
||||
} else {
|
||||
@ -124,34 +159,58 @@ pub fn get_adapters_filtered(adapter_names: &Vec<String>) -> Fallible<Vec<Rc<dyn
|
||||
);
|
||||
Ok(adapters)
|
||||
}
|
||||
pub fn adapter_matcher(
|
||||
adapter_names: &Vec<String>,
|
||||
|
||||
pub fn adapter_matcher<T: AsRef<str>>(
|
||||
adapter_names: &[T],
|
||||
slow: bool,
|
||||
) -> Fallible<impl Fn(FileMeta) -> Option<Rc<dyn FileAdapter>>> {
|
||||
let adapters = get_adapters_filtered(adapter_names)?;
|
||||
let mut fname_regexes = vec![];
|
||||
//let mut mime_regexes = vec![];
|
||||
let mut mime_regexes = vec![];
|
||||
for adapter in adapters.into_iter() {
|
||||
let metadata = adapter.metadata();
|
||||
for matcher in &metadata.matchers {
|
||||
match matcher {
|
||||
//Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
|
||||
Matcher::FileExtension(re) => {
|
||||
use SlowMatcher::*;
|
||||
for matcher in metadata.get_matchers(slow) {
|
||||
match matcher.as_ref() {
|
||||
MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())),
|
||||
Fast(FastMatcher::FileExtension(re)) => {
|
||||
fname_regexes.push((extension_to_regex(re), adapter.clone()))
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
|
||||
//let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
|
||||
let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
|
||||
Ok(move |meta: FileMeta| {
|
||||
// todo: handle multiple conflicting matches
|
||||
let matches = fname_regex_set.matches(&meta.lossy_filename);
|
||||
match matches.iter().next() {
|
||||
Some(m) => Some(fname_regexes[m].1.clone()),
|
||||
None => None,
|
||||
let fname_matches: Vec<_> = fname_regex_set
|
||||
.matches(&meta.lossy_filename)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mime_matches: Vec<_> = if slow {
|
||||
mime_regex_set
|
||||
.matches(&meta.mimetype.expect("No mimetype?"))
|
||||
.into_iter()
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
if fname_matches.len() + mime_matches.len() > 1 {
|
||||
eprintln!("Found multiple adapters for {}:", meta.lossy_filename);
|
||||
for mmatch in mime_matches.iter() {
|
||||
eprintln!(" - {}", mime_regexes[*mmatch].1.metadata().name);
|
||||
}
|
||||
for fmatch in fname_matches.iter() {
|
||||
eprintln!(" - {}", fname_regexes[*fmatch].1.metadata().name);
|
||||
}
|
||||
}
|
||||
if mime_matches.len() == 0 {
|
||||
if fname_matches.len() == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(fname_regexes[fname_matches[0]].1.clone())
|
||||
}
|
||||
} else {
|
||||
Some(mime_regexes[mime_matches[0]].1.clone())
|
||||
}
|
||||
/*for m in mime_regex_set.matches(&meta.mimetype) {
|
||||
return Some(mime_regexes[m].1.clone());
|
||||
}*/
|
||||
})
|
||||
}
|
||||
|
@ -4,7 +4,7 @@ use spawning::SpawningFileAdapter;
|
||||
use std::process::Command;
|
||||
|
||||
// from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs
|
||||
// excluding formats that could cause problems (db = sqlite) or that are already text formats (e.g. xml-based)
|
||||
// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based)
|
||||
//"db" -> Just "docbook"
|
||||
//"adoc" -> Just "asciidoc"
|
||||
//"asciidoc" -> Just "asciidoc"
|
||||
@ -46,10 +46,11 @@ lazy_static! {
|
||||
name: "pandoc".to_owned(),
|
||||
version: 1,
|
||||
description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
#[derive(Default)]
|
||||
|
@ -12,10 +12,11 @@ lazy_static! {
|
||||
version: 1,
|
||||
description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
|
||||
.to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
#[derive(Default)]
|
||||
|
@ -14,10 +14,13 @@ lazy_static! {
|
||||
description:
|
||||
"Uses sqlite bindings to convert sqlite databases into a simple plain text format"
|
||||
.to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: Some(vec![SlowMatcher::MimeType(
|
||||
"application/x-sqlite3".to_owned()
|
||||
)])
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -13,10 +13,11 @@ lazy_static! {
|
||||
name: "tar".to_owned(),
|
||||
version: 1,
|
||||
description: "Reads a tar file as a stream and recurses down into its contents".to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
#[derive(Default)]
|
||||
|
@ -14,10 +14,11 @@ lazy_static! {
|
||||
name: "zip".to_owned(),
|
||||
version: 1,
|
||||
description: "Reads a zip file as a stream and recurses down into its contents".to_owned(),
|
||||
matchers: EXTENSIONS
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| Matcher::FileExtension(s.to_string()))
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
#[derive(Default)]
|
||||
|
31
src/args.rs
31
src/args.rs
@ -32,58 +32,65 @@ set_default!(max_archive_recursion, 4, i32);
|
||||
#[structopt(rename_all = "kebab-case", set_term_width = 80)]
|
||||
pub struct RgaArgs {
|
||||
#[serde(default, skip_serializing_if = "is_default")]
|
||||
#[structopt(long, help = "Disable caching of results")]
|
||||
pub rga_no_cache: bool,
|
||||
#[structopt(long = "--rga-no-cache", help = "Disable caching of results")]
|
||||
pub no_cache: bool,
|
||||
|
||||
#[serde(default, skip_serializing_if = "is_default")]
|
||||
#[structopt(
|
||||
long,
|
||||
long = "--rga-accurate",
|
||||
help = "Use more accurate but slower matching by mime type"
|
||||
)]
|
||||
pub accurate: bool,
|
||||
|
||||
#[serde(default, skip_serializing_if = "is_default")]
|
||||
#[structopt(
|
||||
long = "--rga-adapters",
|
||||
require_equals = true,
|
||||
require_delimiter = true,
|
||||
help = "Change which adapters to use and in which priority order (descending)"
|
||||
)]
|
||||
pub rga_adapters: Vec<String>,
|
||||
pub adapters: Vec<String>,
|
||||
|
||||
#[serde(
|
||||
default = "def_cache_max_blob_len",
|
||||
skip_serializing_if = "def_cache_max_blob_len_if"
|
||||
)]
|
||||
#[structopt(
|
||||
long,
|
||||
long = "--rga-cache-max-blob-len",
|
||||
default_value = "2000000",
|
||||
help = "Max compressed size to cache",
|
||||
long_help = "Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time."
|
||||
)]
|
||||
pub rga_cache_max_blob_len: u32,
|
||||
pub cache_max_blob_len: u32,
|
||||
|
||||
#[serde(
|
||||
default = "def_cache_compression_level",
|
||||
skip_serializing_if = "def_cache_compression_level_if"
|
||||
)]
|
||||
#[structopt(
|
||||
long,
|
||||
long = "--rga-cache-compression-level",
|
||||
default_value = "12",
|
||||
require_equals = true,
|
||||
help = "ZSTD compression level to apply to adapter outputs before storing in cache db"
|
||||
)]
|
||||
pub rga_cache_compression_level: u32,
|
||||
pub cache_compression_level: u32,
|
||||
|
||||
#[serde(
|
||||
default = "def_max_archive_recursion",
|
||||
skip_serializing_if = "def_max_archive_recursion_if"
|
||||
)]
|
||||
#[structopt(
|
||||
long,
|
||||
long = "--rga-max-archive-recursion",
|
||||
default_value = "4",
|
||||
require_equals = true,
|
||||
help = "Maximum nestedness of archives to recurse into"
|
||||
)]
|
||||
pub rga_max_archive_recursion: i32,
|
||||
pub max_archive_recursion: i32,
|
||||
|
||||
// these arguments stop the process, so don't serialize them
|
||||
#[serde(skip)]
|
||||
#[structopt(long, help = "List all known adapters")]
|
||||
pub rga_list_adapters: bool,
|
||||
#[structopt(long = "--rga-list-adapters", help = "List all known adapters")]
|
||||
pub list_adapters: bool,
|
||||
|
||||
#[serde(skip)]
|
||||
#[structopt(long, help = "Show help for ripgrep itself")]
|
||||
|
@ -21,7 +21,7 @@ fn main() -> Fallible<()> {
|
||||
|
||||
let i = File::open(&path)?;
|
||||
let mut o = std::io::stdout();
|
||||
let cache = if args.rga_no_cache {
|
||||
let cache = if args.no_cache {
|
||||
None
|
||||
} else {
|
||||
Some(rga::preproc_cache::open()?)
|
||||
|
@ -62,17 +62,17 @@ fn main() -> Fallible<()> {
|
||||
env_logger::init();
|
||||
|
||||
let (args, passthrough_args) = split_args()?;
|
||||
let adapters = get_adapters_filtered(&args.rga_adapters)?;
|
||||
let adapters = get_adapters_filtered(&args.adapters)?;
|
||||
|
||||
if args.rga_list_adapters {
|
||||
if args.list_adapters {
|
||||
println!("Adapters:\n");
|
||||
for adapter in adapters {
|
||||
let meta = adapter.metadata();
|
||||
let matchers = meta
|
||||
.matchers
|
||||
.fast_matchers
|
||||
.iter()
|
||||
.map(|m| match m {
|
||||
Matcher::FileExtension(ext) => format!(".{}", ext),
|
||||
FastMatcher::FileExtension(ext) => format!(".{}", ext),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ");
|
||||
@ -87,9 +87,9 @@ fn main() -> Fallible<()> {
|
||||
|
||||
let extensions = adapters
|
||||
.iter()
|
||||
.flat_map(|a| &a.metadata().matchers)
|
||||
.flat_map(|a| &a.metadata().fast_matchers)
|
||||
.filter_map(|m| match m {
|
||||
Matcher::FileExtension(ext) => Some(ext as &str),
|
||||
FastMatcher::FileExtension(ext) => Some(ext as &str),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
|
@ -47,7 +47,7 @@ impl<W: Write> Write for CachingWriter<W> {
|
||||
Some(writer) => {
|
||||
let wrote = writer.write(buf)?;
|
||||
let compressed_len = writer.get_ref().len();
|
||||
//eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len);
|
||||
trace!("wrote {} to zstd, len now {}", wrote, compressed_len);
|
||||
if compressed_len > self.max_cache_size {
|
||||
eprintln!("cache longer than max, dropping");
|
||||
//writer.finish();
|
||||
|
@ -20,7 +20,6 @@ pub struct PreprocConfig<'a> {
|
||||
*
|
||||
*/
|
||||
pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
|
||||
let adapters = adapter_matcher(&ai.config.args.rga_adapters)?;
|
||||
let AdaptInfo {
|
||||
filepath_hint,
|
||||
is_real_file,
|
||||
@ -32,11 +31,12 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
|
||||
..
|
||||
} = ai;
|
||||
let PreprocConfig { mut cache, args } = config;
|
||||
let adapters = adapter_matcher(&args.adapters[..], args.accurate)?;
|
||||
let filename = filepath_hint
|
||||
.file_name()
|
||||
.ok_or_else(|| format_err!("Empty filename"))?;
|
||||
eprintln!("depth: {}", archive_recursion_depth);
|
||||
if archive_recursion_depth >= args.rga_max_archive_recursion {
|
||||
if archive_recursion_depth >= args.max_archive_recursion {
|
||||
writeln!(oup, "{}[rga: max archive recursion reached]", line_prefix)?;
|
||||
return Ok(());
|
||||
}
|
||||
@ -49,7 +49,7 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
|
||||
)))?;
|
||||
println!("mimetype: {:?}", mimetype);*/
|
||||
let adapter = adapters(FileMeta {
|
||||
// mimetype,
|
||||
mimetype: None,
|
||||
lossy_filename: filename.to_string_lossy().to_string(),
|
||||
});
|
||||
match adapter {
|
||||
@ -77,8 +77,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
|
||||
// wrapping BufWriter here gives ~10% perf boost
|
||||
let mut compbuf = BufWriter::new(CachingWriter::new(
|
||||
oup,
|
||||
args.rga_cache_max_blob_len.try_into().unwrap(),
|
||||
args.rga_cache_compression_level.try_into().unwrap(),
|
||||
args.cache_max_blob_len.try_into().unwrap(),
|
||||
args.cache_compression_level.try_into().unwrap(),
|
||||
)?);
|
||||
eprintln!("adapting...");
|
||||
ad.adapt(AdaptInfo {
|
||||
|
Loading…
Reference in New Issue
Block a user