From 94099baeb3f877f6b21b33907caa270edd61a834 Mon Sep 17 00:00:00 2001 From: phiresky Date: Tue, 9 Jun 2020 01:45:52 +0200 Subject: [PATCH] custom adapter initial --- src/adapters.rs | 27 +++++- src/adapters/custom.rs | 90 ++++++++++++++++++ src/args.rs | 205 ++++++++++++++++++++++++++++------------- src/bin/rga.rs | 98 ++++++++++---------- src/matching.rs | 5 +- src/preproc.rs | 5 +- 6 files changed, 310 insertions(+), 120 deletions(-) create mode 100644 src/adapters/custom.rs diff --git a/src/adapters.rs b/src/adapters.rs index 8d5f942..33b067b 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -1,3 +1,4 @@ +pub mod custom; pub mod decompress; pub mod ffmpeg; pub mod pandoc; @@ -11,6 +12,7 @@ pub mod zip; use crate::matching::*; use crate::preproc::PreprocConfig; use anyhow::*; +use custom::CustomAdapterConfig; use log::*; use regex::Regex; use std::borrow::Cow; @@ -79,9 +81,21 @@ pub struct AdaptInfo<'a> { /// (enabledAdapters, disabledAdapters) type AdaptersTuple = (Vec>, Vec>); -pub fn get_all_adapters() -> AdaptersTuple { +pub fn get_all_adapters(custom_adapters: Option>) -> AdaptersTuple { // order in descending priority - let enabled_adapters: Vec> = vec![ + let mut enabled_adapters: Vec> = vec![]; + let mut disabled_adapters: Vec> = vec![]; + if let Some(custom_adapters) = custom_adapters { + for adapter_config in custom_adapters { + if adapter_config.default_disabled.unwrap_or(false) { + disabled_adapters.push(Rc::new(adapter_config.to_adapter())); + } else { + enabled_adapters.push(Rc::new(adapter_config.to_adapter())); + } + } + } + + let internal_enabled_adapters: Vec> = vec![ Rc::new(ffmpeg::FFmpegAdapter::new()), Rc::new(pandoc::PandocAdapter::new()), Rc::new(poppler::PopplerAdapter::new()), @@ -90,10 +104,12 @@ pub fn get_all_adapters() -> AdaptersTuple { Rc::new(tar::TarAdapter::new()), Rc::new(sqlite::SqliteAdapter::new()), ]; - let disabled_adapters: Vec> = vec![ + enabled_adapters.extend(internal_enabled_adapters); + let internal_disabled_adapters: Vec> = vec![ Rc::new(pdfpages::PdfPagesAdapter::new()), Rc::new(tesseract::TesseractAdapter::new()), ]; + disabled_adapters.extend(internal_disabled_adapters); (enabled_adapters, disabled_adapters) } @@ -106,9 +122,10 @@ pub fn get_all_adapters() -> AdaptersTuple { * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) */ pub fn get_adapters_filtered>( - adapter_names: &[T], + custom_adapters: Option>, + adapter_names: &Vec, ) -> Result>> { - let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(); + let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = def_enabled_adapters .iter() diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs new file mode 100644 index 0000000..f4eaa7a --- /dev/null +++ b/src/adapters/custom.rs @@ -0,0 +1,90 @@ +use super::{spawning::SpawningFileAdapter, AdapterMeta, GetMetadata}; +use crate::{ + matching::{FastMatcher, SlowMatcher}, + project_dirs, +}; +use anyhow::*; +use derive_more::FromStr; +use log::*; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::ffi::OsString; +use std::{fs::File, io::Write, iter::IntoIterator, str::FromStr}; +use structopt::StructOpt; + +// mostly the same as AdapterMeta + SpawningFileAdapter +#[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] +pub struct CustomAdapterConfig { + /// the unique identifier and name of this adapter. Must only include a-z, 0-9, _ + pub name: String, + /// a description of this adapter. shown in help + pub description: String, + /// if true, the adapter will be disabled by default + pub default_disabled: Option, + /// version identifier. used to key cache entries, change if the configuration or program changes + pub version: i32, + /// the file extensions this adapter supports. For example ["epub", "mobi"] + pub extensions: Vec, + /// if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching + pub mimetypes: Option>, + /// the name or path of the binary to run + pub binary: String, + /// The arguments to run the program with. Placeholders: + /// {}: the file path (TODO) + /// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file + pub args: Vec, +} + +pub struct CustomSpawningFileAdapter { + binary: String, + args: Vec, + meta: AdapterMeta, +} +impl GetMetadata for CustomSpawningFileAdapter { + fn metadata(&self) -> &AdapterMeta { + &self.meta + } +} +impl SpawningFileAdapter for CustomSpawningFileAdapter { + fn get_exe(&self) -> &str { + &self.binary + } + fn command( + &self, + filepath_hint: &std::path::Path, + mut command: std::process::Command, + ) -> std::process::Command { + command.args(&self.args); + command + } +} +impl CustomAdapterConfig { + pub fn to_adapter(self) -> CustomSpawningFileAdapter { + CustomSpawningFileAdapter { + binary: self.binary.clone(), + args: self.args.clone(), + meta: AdapterMeta { + name: self.name, + version: self.version, + description: format!( + "{}\nRuns: {} {}", + self.description, + self.binary, + self.args.join(" ") + ), + recurses: false, + fast_matchers: self + .extensions + .iter() + .map(|s| FastMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: self.mimetypes.map(|mimetypes| { + mimetypes + .iter() + .map(|s| SlowMatcher::MimeType(s.to_string())) + .collect() + }), + }, + } + } +} diff --git a/src/args.rs b/src/args.rs index cb27d32..d22cc9f 100644 --- a/src/args.rs +++ b/src/args.rs @@ -1,4 +1,4 @@ -use crate::project_dirs; +use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; use anyhow::*; use derive_more::FromStr; use log::*; @@ -79,6 +79,12 @@ impl FromStr for CacheMaxBlobLen { } } +/// # rga configuration +/// +/// this is kind of a "polyglot" struct, since it serves three functions +/// +/// 1. describing the command line arguments using structopt+clap and for man page / readme generation +/// 2. describing the config file format (output as JSON schema via schemars) #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default)] #[structopt( name = "ripgrep-all", @@ -89,16 +95,7 @@ impl FromStr for CacheMaxBlobLen { after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]", usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]" )] - -/// # rga configuration -/// -/// this is kind of a "polyglot" struct, since it serves three functions -/// -/// 1. describing the command line arguments using structopt+clap -/// 2. describing the config file format (output as JSON schema via schemars) pub struct RgaConfig { - #[serde(default, skip_serializing_if = "is_default")] - #[structopt(long = "--rga-no-cache")] /// Disable caching of results /// /// By default, rga caches the extracted text, if it is small enough, @@ -107,10 +104,10 @@ pub struct RgaConfig { /// or C:\Users\username\AppData\Local\rga on Windows. /// This way, repeated searches on the same set of files will be much faster. /// If you pass this flag, all caching will be disabled. + #[serde(default, skip_serializing_if = "is_default")] + #[structopt(long = "--rga-no-cache")] pub no_cache: bool, - #[serde(default, skip_serializing_if = "is_default")] - #[structopt(long = "--rga-accurate")] /// Use more accurate but slower matching by mime type /// /// By default, rga will match files using file extensions. @@ -119,21 +116,26 @@ pub struct RgaConfig { /// will try to detect the mime type of input files using the magic bytes /// (similar to the `file` utility), and use that to choose the adapter. /// Detection is only done on the first 8KiB of the file, since we can't always seek on the input (in archives). + #[serde(default, skip_serializing_if = "is_default")] + #[structopt(long = "--rga-accurate")] pub accurate: bool, + /// Change which adapters to use and in which priority order (descending) + /// + /// "foo,bar" means use only adapters foo and bar. + /// "-bar,baz" means use all default adapters except for bar and baz. + /// "+bar,baz" means use all default adapters and also bar and baz. #[serde(default, skip_serializing_if = "is_default")] #[structopt( long = "--rga-adapters", require_equals = true, require_delimiter = true )] - /// Change which adapters to use and in which priority order (descending) - /// - /// "foo,bar" means use only adapters foo and bar. - /// "-bar,baz" means use all default adapters except for bar and baz. - /// "+bar,baz" means use all default adapters and also bar and baz. pub adapters: Vec, + /// Max compressed size to cache + /// + /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G #[serde(default, skip_serializing_if = "is_default")] #[structopt( default_value, @@ -142,11 +144,11 @@ pub struct RgaConfig { require_equals = true, // parse(try_from_str = parse_readable_bytes_str) )] - /// Max compressed size to cache - /// - /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G pub cache_max_blob_len: CacheMaxBlobLen, + /// ZSTD compression level to apply to adapter outputs before storing in cache db + /// + /// Ranges from 1 - 22 #[serde(default, skip_serializing_if = "is_default")] #[structopt( default_value, @@ -155,11 +157,9 @@ pub struct RgaConfig { require_equals = true, help = "" )] - /// ZSTD compression level to apply to adapter outputs before storing in cache db - /// - /// Ranges from 1 - 22 pub cache_compression_level: CacheCompressionLevel, + /// Maximum nestedness of archives to recurse into #[serde(default, skip_serializing_if = "is_default")] #[structopt( default_value, @@ -167,13 +167,22 @@ pub struct RgaConfig { require_equals = true, hidden_short_help = true )] - /// Maximum nestedness of archives to recurse into pub max_archive_recursion: MaxArchiveRecursion, - #[serde(skip)] - #[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)] + ////////////////////////////////////////// + //////////////////////////// Config file only + ////////////////////////////////////////// + #[serde(default, skip_serializing_if = "is_default")] + #[structopt(skip)] + pub custom_adapters: Option>, + + ////////////////////////////////////////// + //////////////////////////// CMD line only + ////////////////////////////////////////// /// same as passing path directly, except if argument is empty /// kinda hacky, but if no file is found, fzf calls rga with empty string as path, which causes No such file or directory from rg. So filter those cases and return specially + #[serde(skip)] + #[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)] pub fzf_path: Option, // these arguments are basically "subcommands" that stop the process, so don't serialize them @@ -195,10 +204,6 @@ pub struct RgaConfig { #[serde(skip)] #[structopt(long, help = "Show version of ripgrep itself")] pub rg_version: bool, - - #[serde(rename = "$schema", default = "default_schema_path")] - #[structopt(skip)] - pub _schema_key: String, } fn default_schema_path() -> String { "./config.schema.json".to_string() @@ -206,6 +211,21 @@ fn default_schema_path() -> String { static RGA_CONFIG: &str = "RGA_CONFIG"; +use serde_json::Value; +fn json_merge(a: &mut Value, b: &Value) { + match (a, b) { + (&mut Value::Object(ref mut a), &Value::Object(ref b)) => { + for (k, v) in b { + json_merge(a.entry(k.clone()).or_insert(Value::Null), v); + } + } + (a, b) => { + *a = b.clone(); + } + } +} + +// todo: this function is pretty inefficient. loads of json / copying stuff pub fn parse_args(args: I) -> Result where I: IntoIterator, @@ -213,42 +233,101 @@ where { let proj = project_dirs()?; let config_dir = proj.config_dir(); - if config_dir.join("config.json").exists() { - // todo: read config - } else { - std::fs::create_dir_all(config_dir)?; - let mut schemafile = File::create(config_dir.join("config.schema.json"))?; - - schemafile - .write(serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes())?; - - let mut configfile = File::create(config_dir.join("config.json"))?; - let mut v = serde_json::to_value(&RgaConfig::default())?; - match &mut v { - serde_json::Value::Object(o) => { - o["$schema"] = serde_json::Value::String("./config.schema.json".to_string()) + let config_filename = config_dir.join("config.json"); + let config_file_config = { + if config_filename.exists() { + let config_file_contents = + std::fs::read_to_string(&config_filename).with_context(|| { + format!( + "Could not read config file json {}", + config_filename.to_string_lossy() + ) + })?; + { + // just for error messages + let config_json: RgaConfig = serde_json::from_str(&config_file_contents) + .with_context(|| format!("Error in config file: {}", config_file_contents))?; } - _ => panic!("impos"), - } - configfile.write(serde_json::to_string_pretty(&v)?.as_bytes())?; - } - match std::env::var(RGA_CONFIG) { - Ok(val) => { - debug!( - "Loading args from env {}={}, ignoring cmd args", - RGA_CONFIG, val - ); - Ok(serde_json::from_str(&val)?) - } - Err(_) => { - let matches = RgaConfig::from_iter(args); - let serialized_config = serde_json::to_string(&matches)?; - std::env::set_var(RGA_CONFIG, &serialized_config); - debug!("{}={}", RGA_CONFIG, serialized_config); + let config_json: serde_json::Value = serde_json::from_str(&config_file_contents) + .context("Could not parse config json")?; + log::debug!("Config JSON: {}", config_json.to_string()); + config_json + } else { + // write default config + std::fs::create_dir_all(config_dir)?; + let mut schemafile = File::create(config_dir.join("config.schema.json"))?; - Ok(matches) + schemafile.write( + serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes(), + )?; + + let mut config_json = serde_json::to_value(&RgaConfig::default())?; + match &mut config_json { + serde_json::Value::Object(o) => { + o.insert( + "$schema".to_string(), + serde_json::Value::String("./config.schema.json".to_string()), + ); + } + _ => panic!("impos"), + } + let mut configfile = File::create(config_dir.join("config.json"))?; + configfile.write(serde_json::to_string_pretty(&config_json)?.as_bytes())?; + config_json } + }; + let env_var_config = { + let val = std::env::var(RGA_CONFIG).ok(); + if let Some(val) = val { + serde_json::from_str(&val).context("could not parse config from env RGA_CONFIG")? + } else { + serde_json::to_value(&RgaConfig::default())? + } + }; + + let arg_matches = RgaConfig::from_iter(args); + let args_config = { + let serialized_config = serde_json::to_value(&arg_matches)?; + + serialized_config + }; + + log::debug!( + "Configs:\n{}: {}\n{}: {}\nArgs: {}", + config_filename.to_string_lossy(), + serde_json::to_string_pretty(&config_file_config)?, + RGA_CONFIG, + serde_json::to_string_pretty(&env_var_config)?, + serde_json::to_string_pretty(&args_config)? + ); + let mut merged_config = config_file_config.clone(); + json_merge(&mut merged_config, &env_var_config); + json_merge(&mut merged_config, &args_config); + + log::debug!( + "Merged config: {}", + serde_json::to_string_pretty(&merged_config)? + ); + let mut res: RgaConfig = serde_json::from_value(merged_config.clone()) + .map_err(|e| { + println!("{:?}", e); + e + }) + .with_context(|| { + format!( + "Error parsing merged config: {}", + serde_json::to_string_pretty(&merged_config).expect("no tostring") + ) + })?; + { + // readd values with [serde(skip)] + res.fzf_path = arg_matches.fzf_path; + res.list_adapters = arg_matches.list_adapters; + res.print_config_schema = arg_matches.print_config_schema; + res.rg_help = arg_matches.rg_help; + res.rg_version = arg_matches.rg_version; } + Ok(res) } /// Split arguments into the ones we care about and the ones rg cares about @@ -278,7 +357,7 @@ pub fn split_args() -> Result<(RgaConfig, Vec)> { } }); debug!("our_args: {:?}", our_args); - let matches = parse_args(our_args)?; + let matches = parse_args(our_args).context("Could not parse args")?; if matches.rg_help { passthrough_args.insert(0, "--help".into()); } diff --git a/src/bin/rga.rs b/src/bin/rga.rs index df31627..7dea264 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -10,6 +10,54 @@ use structopt::StructOpt; use schemars::schema_for; use std::process::Command; +fn list_adapters(args: RgaConfig) -> Result<()> { + let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters.clone()); + + println!("Adapters:\n"); + let print = |adapter: std::rc::Rc| { + let meta = adapter.metadata(); + let matchers = meta + .fast_matchers + .iter() + .map(|m| match m { + FastMatcher::FileExtension(ext) => format!(".{}", ext), + }) + .collect::>() + .join(", "); + let slow_matchers = meta + .slow_matchers + .as_ref() + .unwrap_or(&vec![]) + .iter() + .filter_map(|m| match m { + SlowMatcher::MimeType(x) => Some(format!("{}", x)), + SlowMatcher::Fast(_) => None, + }) + .collect::>() + .join(", "); + let mime_text = if slow_matchers.is_empty() { + "".to_owned() + } else { + format!("Mime Types: {}", slow_matchers) + }; + print!( + " - **{name}**\n {desc} \n Extensions: {matchers} \n {mime} \n", + name = meta.name, + desc = meta.description.replace("\n", "\n "), + matchers = matchers, + mime = mime_text + ); + println!(""); + }; + for adapter in enabled_adapters { + print(adapter) + } + println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+pdfpages,tesseract':\n"); + for adapter in disabled_adapters { + print(adapter) + } + return Ok(()); +} fn main() -> anyhow::Result<()> { env_logger::init(); @@ -19,54 +67,8 @@ fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string_pretty(&schema_for!(RgaConfig))?); return Ok(()); } - if args.list_adapters { - let (enabled_adapters, disabled_adapters) = get_all_adapters(); - - println!("Adapters:\n"); - let print = |adapter: std::rc::Rc| { - let meta = adapter.metadata(); - let matchers = meta - .fast_matchers - .iter() - .map(|m| match m { - FastMatcher::FileExtension(ext) => format!(".{}", ext), - }) - .collect::>() - .join(", "); - let slow_matchers = meta - .slow_matchers - .as_ref() - .unwrap_or(&vec![]) - .iter() - .filter_map(|m| match m { - SlowMatcher::MimeType(x) => Some(format!("{}", x)), - SlowMatcher::Fast(_) => None, - }) - .collect::>() - .join(", "); - let mime_text = if slow_matchers.is_empty() { - "".to_owned() - } else { - format!("Mime Types: {}", slow_matchers) - }; - print!( - " - **{name}**\n {desc} \n Extensions: {matchers} \n {mime} \n", - name = meta.name, - desc = meta.description, - matchers = matchers, - mime = mime_text - ); - println!(""); - }; - for adapter in enabled_adapters { - print(adapter) - } - println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+pdfpages,tesseract':\n"); - for adapter in disabled_adapters { - print(adapter) - } - return Ok(()); + return list_adapters(args); } if let Some(path) = args.fzf_path { if path == "_" { @@ -84,7 +86,7 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let adapters = get_adapters_filtered(&args.adapters)?; + let adapters = get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?; let pre_glob = if !args.accurate { let extensions = adapters diff --git a/src/matching.rs b/src/matching.rs index 5cf6f05..297faf2 100644 --- a/src/matching.rs +++ b/src/matching.rs @@ -46,11 +46,10 @@ pub fn extension_to_regex(extension: &str) -> Regex { .expect("we know this regex compiles") } -pub fn adapter_matcher>( - adapter_names: &[T], +pub fn adapter_matcher( + adapters: Vec>, slow: bool, ) -> Result Option<(Rc, SlowMatcher)>> { - let adapters = get_adapters_filtered(adapter_names)?; // need order later let adapter_names: Vec = adapters.iter().map(|e| e.metadata().name.clone()).collect(); let mut fname_regexes = vec![]; diff --git a/src/preproc.rs b/src/preproc.rs index ef4845f..5cebe0a 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -34,7 +34,10 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> { .. } = ai; let PreprocConfig { mut cache, args } = config; - let adapters = adapter_matcher(&args.adapters[..], args.accurate)?; + let adapters = adapter_matcher( + get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?, + args.accurate, + )?; let filename = filepath_hint .file_name() .ok_or_else(|| format_err!("Empty filename"))?;