custom adapter initial

This commit is contained in:
phiresky 2020-06-09 01:45:52 +02:00
parent 963524bbf5
commit 94099baeb3
6 changed files with 310 additions and 120 deletions

View File

@ -1,3 +1,4 @@
pub mod custom;
pub mod decompress; pub mod decompress;
pub mod ffmpeg; pub mod ffmpeg;
pub mod pandoc; pub mod pandoc;
@ -11,6 +12,7 @@ pub mod zip;
use crate::matching::*; use crate::matching::*;
use crate::preproc::PreprocConfig; use crate::preproc::PreprocConfig;
use anyhow::*; use anyhow::*;
use custom::CustomAdapterConfig;
use log::*; use log::*;
use regex::Regex; use regex::Regex;
use std::borrow::Cow; use std::borrow::Cow;
@ -79,9 +81,21 @@ pub struct AdaptInfo<'a> {
/// (enabledAdapters, disabledAdapters) /// (enabledAdapters, disabledAdapters)
type AdaptersTuple = (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>); type AdaptersTuple = (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>);
pub fn get_all_adapters() -> AdaptersTuple { pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
// order in descending priority // order in descending priority
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![ let mut enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![];
let mut disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![];
if let Some(custom_adapters) = custom_adapters {
for adapter_config in custom_adapters {
if adapter_config.default_disabled.unwrap_or(false) {
disabled_adapters.push(Rc::new(adapter_config.to_adapter()));
} else {
enabled_adapters.push(Rc::new(adapter_config.to_adapter()));
}
}
}
let internal_enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(ffmpeg::FFmpegAdapter::new()), Rc::new(ffmpeg::FFmpegAdapter::new()),
Rc::new(pandoc::PandocAdapter::new()), Rc::new(pandoc::PandocAdapter::new()),
Rc::new(poppler::PopplerAdapter::new()), Rc::new(poppler::PopplerAdapter::new()),
@ -90,10 +104,12 @@ pub fn get_all_adapters() -> AdaptersTuple {
Rc::new(tar::TarAdapter::new()), Rc::new(tar::TarAdapter::new()),
Rc::new(sqlite::SqliteAdapter::new()), Rc::new(sqlite::SqliteAdapter::new()),
]; ];
let disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![ enabled_adapters.extend(internal_enabled_adapters);
let internal_disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(pdfpages::PdfPagesAdapter::new()), Rc::new(pdfpages::PdfPagesAdapter::new()),
Rc::new(tesseract::TesseractAdapter::new()), Rc::new(tesseract::TesseractAdapter::new()),
]; ];
disabled_adapters.extend(internal_disabled_adapters);
(enabled_adapters, disabled_adapters) (enabled_adapters, disabled_adapters)
} }
@ -106,9 +122,10 @@ pub fn get_all_adapters() -> AdaptersTuple {
* - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
*/ */
pub fn get_adapters_filtered<T: AsRef<str>>( pub fn get_adapters_filtered<T: AsRef<str>>(
adapter_names: &[T], custom_adapters: Option<Vec<CustomAdapterConfig>>,
adapter_names: &Vec<T>,
) -> Result<Vec<Rc<dyn FileAdapter>>> { ) -> Result<Vec<Rc<dyn FileAdapter>>> {
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(); let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters);
let adapters = if !adapter_names.is_empty() { let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = def_enabled_adapters let adapters_map: HashMap<_, _> = def_enabled_adapters
.iter() .iter()

90
src/adapters/custom.rs Normal file
View File

@ -0,0 +1,90 @@
use super::{spawning::SpawningFileAdapter, AdapterMeta, GetMetadata};
use crate::{
matching::{FastMatcher, SlowMatcher},
project_dirs,
};
use anyhow::*;
use derive_more::FromStr;
use log::*;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::ffi::OsString;
use std::{fs::File, io::Write, iter::IntoIterator, str::FromStr};
use structopt::StructOpt;
// Declarative description of a user-defined adapter that wraps an external program.
// Values of this type are converted into a runnable adapter with `to_adapter`.
// mostly the same as AdapterMeta + SpawningFileAdapter
//
// NOTE(review): the `///` field comments below are intentionally left untouched.
// The struct derives JsonSchema, which presumably copies doc comments into the
// generated config schema, so rewording them would change emitted output — confirm
// before editing them.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)]
pub struct CustomAdapterConfig {
// Becomes the `name` of the resulting adapter's metadata.
/// the unique identifier and name of this adapter. Must only include a-z, 0-9, _
pub name: String,
// `to_adapter` appends a "Runs: <binary> <args>" line to this text.
/// a description of this adapter. shown in help
pub description: String,
// Optional so it can be omitted from the config; not consulted by `to_adapter` itself.
/// if true, the adapter will be disabled by default
pub default_disabled: Option<bool>,
/// version identifier. used to key cache entries, change if the configuration or program changes
pub version: i32,
// Each entry becomes a `FastMatcher::FileExtension` in `to_adapter`.
/// the file extensions this adapter supports. For example ["epub", "mobi"]
pub extensions: Vec<String>,
// Each entry becomes a `SlowMatcher::MimeType` in `to_adapter`; None means no slow matchers.
/// if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching
pub mimetypes: Option<Vec<String>>,
// Returned unchanged by the adapter's `get_exe`.
/// the name or path of the binary to run
pub binary: String,
// Currently forwarded to the program verbatim; the `{}` placeholder is not substituted yet.
/// The arguments to run the program with. Placeholders:
/// {}: the file path (TODO)
/// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file
pub args: Vec<String>,
}
/// A file adapter that runs a user-configured external program.
///
/// Built from a `CustomAdapterConfig` via `CustomAdapterConfig::to_adapter`.
/// The fields are private; the adapter is used through its `GetMetadata` and
/// `SpawningFileAdapter` implementations.
pub struct CustomSpawningFileAdapter {
/// Name or path of the executable; returned by `get_exe`.
binary: String,
/// Arguments added to the command in `command`.
args: Vec<String>,
/// Adapter metadata returned by `metadata`.
meta: AdapterMeta,
}
impl GetMetadata for CustomSpawningFileAdapter {
fn metadata(&self) -> &AdapterMeta {
&self.meta
}
}
impl SpawningFileAdapter for CustomSpawningFileAdapter {
    /// Returns the name or path of the configured executable.
    fn get_exe(&self) -> &str {
        &self.binary
    }

    /// Adds the user-configured arguments to `command` and returns it.
    ///
    /// The path hint is intentionally ignored for now: substitution of the `{}`
    /// placeholder is still a TODO, so the arguments are forwarded verbatim.
    /// The parameter is underscore-prefixed to make that explicit and to avoid
    /// the `unused_variables` warning.
    fn command(
        &self,
        _filepath_hint: &std::path::Path,
        mut command: std::process::Command,
    ) -> std::process::Command {
        command.args(&self.args);
        command
    }
}
impl CustomAdapterConfig {
pub fn to_adapter(self) -> CustomSpawningFileAdapter {
CustomSpawningFileAdapter {
binary: self.binary.clone(),
args: self.args.clone(),
meta: AdapterMeta {
name: self.name,
version: self.version,
description: format!(
"{}\nRuns: {} {}",
self.description,
self.binary,
self.args.join(" ")
),
recurses: false,
fast_matchers: self
.extensions
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: self.mimetypes.map(|mimetypes| {
mimetypes
.iter()
.map(|s| SlowMatcher::MimeType(s.to_string()))
.collect()
}),
},
}
}
}

View File

@ -1,4 +1,4 @@
use crate::project_dirs; use crate::{adapters::custom::CustomAdapterConfig, project_dirs};
use anyhow::*; use anyhow::*;
use derive_more::FromStr; use derive_more::FromStr;
use log::*; use log::*;
@ -79,6 +79,12 @@ impl FromStr for CacheMaxBlobLen {
} }
} }
/// # rga configuration
///
/// this is kind of a "polyglot" struct, since it serves three functions
///
/// 1. describing the command line arguments using structopt+clap and for man page / readme generation
/// 2. describing the config file format (output as JSON schema via schemars)
#[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default)] #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default)]
#[structopt( #[structopt(
name = "ripgrep-all", name = "ripgrep-all",
@ -89,16 +95,7 @@ impl FromStr for CacheMaxBlobLen {
after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]", after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]",
usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]" usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]"
)] )]
/// # rga configuration
///
/// this is kind of a "polyglot" struct, since it serves three functions
///
/// 1. describing the command line arguments using structopt+clap
/// 2. describing the config file format (output as JSON schema via schemars)
pub struct RgaConfig { pub struct RgaConfig {
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(long = "--rga-no-cache")]
/// Disable caching of results /// Disable caching of results
/// ///
/// By default, rga caches the extracted text, if it is small enough, /// By default, rga caches the extracted text, if it is small enough,
@ -107,10 +104,10 @@ pub struct RgaConfig {
/// or C:\Users\username\AppData\Local\rga on Windows. /// or C:\Users\username\AppData\Local\rga on Windows.
/// This way, repeated searches on the same set of files will be much faster. /// This way, repeated searches on the same set of files will be much faster.
/// If you pass this flag, all caching will be disabled. /// If you pass this flag, all caching will be disabled.
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(long = "--rga-no-cache")]
pub no_cache: bool, pub no_cache: bool,
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(long = "--rga-accurate")]
/// Use more accurate but slower matching by mime type /// Use more accurate but slower matching by mime type
/// ///
/// By default, rga will match files using file extensions. /// By default, rga will match files using file extensions.
@ -119,21 +116,26 @@ pub struct RgaConfig {
/// will try to detect the mime type of input files using the magic bytes /// will try to detect the mime type of input files using the magic bytes
/// (similar to the `file` utility), and use that to choose the adapter. /// (similar to the `file` utility), and use that to choose the adapter.
/// Detection is only done on the first 8KiB of the file, since we can't always seek on the input (in archives). /// Detection is only done on the first 8KiB of the file, since we can't always seek on the input (in archives).
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(long = "--rga-accurate")]
pub accurate: bool, pub accurate: bool,
/// Change which adapters to use and in which priority order (descending)
///
/// "foo,bar" means use only adapters foo and bar.
/// "-bar,baz" means use all default adapters except for bar and baz.
/// "+bar,baz" means use all default adapters and also bar and baz.
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
#[structopt( #[structopt(
long = "--rga-adapters", long = "--rga-adapters",
require_equals = true, require_equals = true,
require_delimiter = true require_delimiter = true
)] )]
/// Change which adapters to use and in which priority order (descending)
///
/// "foo,bar" means use only adapters foo and bar.
/// "-bar,baz" means use all default adapters except for bar and baz.
/// "+bar,baz" means use all default adapters and also bar and baz.
pub adapters: Vec<String>, pub adapters: Vec<String>,
/// Max compressed size to cache
///
/// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
#[structopt( #[structopt(
default_value, default_value,
@ -142,11 +144,11 @@ pub struct RgaConfig {
require_equals = true, require_equals = true,
// parse(try_from_str = parse_readable_bytes_str) // parse(try_from_str = parse_readable_bytes_str)
)] )]
/// Max compressed size to cache
///
/// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G
pub cache_max_blob_len: CacheMaxBlobLen, pub cache_max_blob_len: CacheMaxBlobLen,
/// ZSTD compression level to apply to adapter outputs before storing in cache db
///
/// Ranges from 1 - 22
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
#[structopt( #[structopt(
default_value, default_value,
@ -155,11 +157,9 @@ pub struct RgaConfig {
require_equals = true, require_equals = true,
help = "" help = ""
)] )]
/// ZSTD compression level to apply to adapter outputs before storing in cache db
///
/// Ranges from 1 - 22
pub cache_compression_level: CacheCompressionLevel, pub cache_compression_level: CacheCompressionLevel,
/// Maximum nestedness of archives to recurse into
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
#[structopt( #[structopt(
default_value, default_value,
@ -167,13 +167,22 @@ pub struct RgaConfig {
require_equals = true, require_equals = true,
hidden_short_help = true hidden_short_help = true
)] )]
/// Maximum nestedness of archives to recurse into
pub max_archive_recursion: MaxArchiveRecursion, pub max_archive_recursion: MaxArchiveRecursion,
#[serde(skip)] //////////////////////////////////////////
#[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)] //////////////////////////// Config file only
//////////////////////////////////////////
#[serde(default, skip_serializing_if = "is_default")]
#[structopt(skip)]
pub custom_adapters: Option<Vec<CustomAdapterConfig>>,
//////////////////////////////////////////
//////////////////////////// CMD line only
//////////////////////////////////////////
/// same as passing path directly, except if argument is empty /// same as passing path directly, except if argument is empty
/// kinda hacky, but if no file is found, fzf calls rga with empty string as path, which causes No such file or directory from rg. So filter those cases and return specially /// kinda hacky, but if no file is found, fzf calls rga with empty string as path, which causes No such file or directory from rg. So filter those cases and return specially
#[serde(skip)]
#[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)]
pub fzf_path: Option<String>, pub fzf_path: Option<String>,
// these arguments are basically "subcommands" that stop the process, so don't serialize them // these arguments are basically "subcommands" that stop the process, so don't serialize them
@ -195,10 +204,6 @@ pub struct RgaConfig {
#[serde(skip)] #[serde(skip)]
#[structopt(long, help = "Show version of ripgrep itself")] #[structopt(long, help = "Show version of ripgrep itself")]
pub rg_version: bool, pub rg_version: bool,
#[serde(rename = "$schema", default = "default_schema_path")]
#[structopt(skip)]
pub _schema_key: String,
} }
fn default_schema_path() -> String { fn default_schema_path() -> String {
"./config.schema.json".to_string() "./config.schema.json".to_string()
@ -206,6 +211,21 @@ fn default_schema_path() -> String {
static RGA_CONFIG: &str = "RGA_CONFIG"; static RGA_CONFIG: &str = "RGA_CONFIG";
use serde_json::Value;
/// Recursively merges `b` into `a`.
///
/// When both values are JSON objects, every key of `b` is merged into the
/// entry of the same name in `a` (entries missing from `a` start out as
/// `null`). In every other case `a` is overwritten with a clone of `b`.
fn json_merge(a: &mut Value, b: &Value) {
    if let (Value::Object(target), Value::Object(source)) = (&mut *a, b) {
        for (key, value) in source {
            let slot = target.entry(key.clone()).or_insert(Value::Null);
            json_merge(slot, value);
        }
        return;
    }
    // At least one side is not an object: the right-hand value wins wholesale.
    *a = b.clone();
}
// todo: this function is pretty inefficient. loads of json / copying stuff
pub fn parse_args<I>(args: I) -> Result<RgaConfig> pub fn parse_args<I>(args: I) -> Result<RgaConfig>
where where
I: IntoIterator, I: IntoIterator,
@ -213,42 +233,101 @@ where
{ {
let proj = project_dirs()?; let proj = project_dirs()?;
let config_dir = proj.config_dir(); let config_dir = proj.config_dir();
if config_dir.join("config.json").exists() { let config_filename = config_dir.join("config.json");
// todo: read config let config_file_config = {
if config_filename.exists() {
let config_file_contents =
std::fs::read_to_string(&config_filename).with_context(|| {
format!(
"Could not read config file json {}",
config_filename.to_string_lossy()
)
})?;
{
// just for error messages
let config_json: RgaConfig = serde_json::from_str(&config_file_contents)
.with_context(|| format!("Error in config file: {}", config_file_contents))?;
}
let config_json: serde_json::Value = serde_json::from_str(&config_file_contents)
.context("Could not parse config json")?;
log::debug!("Config JSON: {}", config_json.to_string());
config_json
} else { } else {
// write default config
std::fs::create_dir_all(config_dir)?; std::fs::create_dir_all(config_dir)?;
let mut schemafile = File::create(config_dir.join("config.schema.json"))?; let mut schemafile = File::create(config_dir.join("config.schema.json"))?;
schemafile schemafile.write(
.write(serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes())?; serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes(),
)?;
let mut configfile = File::create(config_dir.join("config.json"))?; let mut config_json = serde_json::to_value(&RgaConfig::default())?;
let mut v = serde_json::to_value(&RgaConfig::default())?; match &mut config_json {
match &mut v {
serde_json::Value::Object(o) => { serde_json::Value::Object(o) => {
o["$schema"] = serde_json::Value::String("./config.schema.json".to_string()) o.insert(
"$schema".to_string(),
serde_json::Value::String("./config.schema.json".to_string()),
);
} }
_ => panic!("impos"), _ => panic!("impos"),
} }
configfile.write(serde_json::to_string_pretty(&v)?.as_bytes())?; let mut configfile = File::create(config_dir.join("config.json"))?;
configfile.write(serde_json::to_string_pretty(&config_json)?.as_bytes())?;
config_json
} }
match std::env::var(RGA_CONFIG) { };
Ok(val) => { let env_var_config = {
debug!( let val = std::env::var(RGA_CONFIG).ok();
"Loading args from env {}={}, ignoring cmd args", if let Some(val) = val {
RGA_CONFIG, val serde_json::from_str(&val).context("could not parse config from env RGA_CONFIG")?
); } else {
Ok(serde_json::from_str(&val)?) serde_json::to_value(&RgaConfig::default())?
} }
Err(_) => { };
let matches = RgaConfig::from_iter(args);
let serialized_config = serde_json::to_string(&matches)?;
std::env::set_var(RGA_CONFIG, &serialized_config);
debug!("{}={}", RGA_CONFIG, serialized_config);
Ok(matches) let arg_matches = RgaConfig::from_iter(args);
} let args_config = {
let serialized_config = serde_json::to_value(&arg_matches)?;
serialized_config
};
log::debug!(
"Configs:\n{}: {}\n{}: {}\nArgs: {}",
config_filename.to_string_lossy(),
serde_json::to_string_pretty(&config_file_config)?,
RGA_CONFIG,
serde_json::to_string_pretty(&env_var_config)?,
serde_json::to_string_pretty(&args_config)?
);
let mut merged_config = config_file_config.clone();
json_merge(&mut merged_config, &env_var_config);
json_merge(&mut merged_config, &args_config);
log::debug!(
"Merged config: {}",
serde_json::to_string_pretty(&merged_config)?
);
let mut res: RgaConfig = serde_json::from_value(merged_config.clone())
.map_err(|e| {
println!("{:?}", e);
e
})
.with_context(|| {
format!(
"Error parsing merged config: {}",
serde_json::to_string_pretty(&merged_config).expect("no tostring")
)
})?;
{
// readd values with [serde(skip)]
res.fzf_path = arg_matches.fzf_path;
res.list_adapters = arg_matches.list_adapters;
res.print_config_schema = arg_matches.print_config_schema;
res.rg_help = arg_matches.rg_help;
res.rg_version = arg_matches.rg_version;
} }
Ok(res)
} }
/// Split arguments into the ones we care about and the ones rg cares about /// Split arguments into the ones we care about and the ones rg cares about
@ -278,7 +357,7 @@ pub fn split_args() -> Result<(RgaConfig, Vec<OsString>)> {
} }
}); });
debug!("our_args: {:?}", our_args); debug!("our_args: {:?}", our_args);
let matches = parse_args(our_args)?; let matches = parse_args(our_args).context("Could not parse args")?;
if matches.rg_help { if matches.rg_help {
passthrough_args.insert(0, "--help".into()); passthrough_args.insert(0, "--help".into());
} }

View File

@ -10,18 +10,8 @@ use structopt::StructOpt;
use schemars::schema_for; use schemars::schema_for;
use std::process::Command; use std::process::Command;
fn main() -> anyhow::Result<()> { fn list_adapters(args: RgaConfig) -> Result<()> {
env_logger::init(); let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters.clone());
let (args, mut passthrough_args) = split_args()?;
if args.print_config_schema {
println!("{}", serde_json::to_string_pretty(&schema_for!(RgaConfig))?);
return Ok(());
}
if args.list_adapters {
let (enabled_adapters, disabled_adapters) = get_all_adapters();
println!("Adapters:\n"); println!("Adapters:\n");
let print = |adapter: std::rc::Rc<dyn FileAdapter>| { let print = |adapter: std::rc::Rc<dyn FileAdapter>| {
@ -53,7 +43,7 @@ fn main() -> anyhow::Result<()> {
print!( print!(
" - **{name}**\n {desc} \n Extensions: {matchers} \n {mime} \n", " - **{name}**\n {desc} \n Extensions: {matchers} \n {mime} \n",
name = meta.name, name = meta.name,
desc = meta.description, desc = meta.description.replace("\n", "\n "),
matchers = matchers, matchers = matchers,
mime = mime_text mime = mime_text
); );
@ -68,6 +58,18 @@ fn main() -> anyhow::Result<()> {
} }
return Ok(()); return Ok(());
} }
fn main() -> anyhow::Result<()> {
env_logger::init();
let (args, mut passthrough_args) = split_args()?;
if args.print_config_schema {
println!("{}", serde_json::to_string_pretty(&schema_for!(RgaConfig))?);
return Ok(());
}
if args.list_adapters {
return list_adapters(args);
}
if let Some(path) = args.fzf_path { if let Some(path) = args.fzf_path {
if path == "_" { if path == "_" {
// fzf found no result, ignore everything and return // fzf found no result, ignore everything and return
@ -84,7 +86,7 @@ fn main() -> anyhow::Result<()> {
return Ok(()); return Ok(());
} }
let adapters = get_adapters_filtered(&args.adapters)?; let adapters = get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?;
let pre_glob = if !args.accurate { let pre_glob = if !args.accurate {
let extensions = adapters let extensions = adapters

View File

@ -46,11 +46,10 @@ pub fn extension_to_regex(extension: &str) -> Regex {
.expect("we know this regex compiles") .expect("we know this regex compiles")
} }
pub fn adapter_matcher<T: AsRef<str>>( pub fn adapter_matcher(
adapter_names: &[T], adapters: Vec<Rc<dyn FileAdapter>>,
slow: bool, slow: bool,
) -> Result<impl Fn(FileMeta) -> Option<(Rc<dyn FileAdapter>, SlowMatcher)>> { ) -> Result<impl Fn(FileMeta) -> Option<(Rc<dyn FileAdapter>, SlowMatcher)>> {
let adapters = get_adapters_filtered(adapter_names)?;
// need order later // need order later
let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect(); let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect();
let mut fname_regexes = vec![]; let mut fname_regexes = vec![];

View File

@ -34,7 +34,10 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> {
.. ..
} = ai; } = ai;
let PreprocConfig { mut cache, args } = config; let PreprocConfig { mut cache, args } = config;
let adapters = adapter_matcher(&args.adapters[..], args.accurate)?; let adapters = adapter_matcher(
get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?,
args.accurate,
)?;
let filename = filepath_hint let filename = filepath_hint
.file_name() .file_name()
.ok_or_else(|| format_err!("Empty filename"))?; .ok_or_else(|| format_err!("Empty filename"))?;