make poppler and pandoc internal custom adapters

This commit is contained in:
phiresky 2020-06-09 18:27:22 +02:00
parent 144b554f0d
commit 8070a94d84
16 changed files with 187 additions and 165 deletions

34
Cargo.lock generated
View File

@ -294,6 +294,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "dyn-clone"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3ec9c7fb9a2ce708751c98e31ccbae74b6ab194f5c8e30cfb7ed62e38b70866"
[[package]]
name = "either"
version = "1.5.3"
@ -479,6 +485,16 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "indexmap"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c398b2b113b55809ceb9ee3e753fcbac793f1956663f3c36549c1346015c2afe"
dependencies = [
"autocfg 1.0.0",
"serde",
]
[[package]]
name = "itertools"
version = "0.9.0"
@ -862,9 +878,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quote"
version = "1.0.6"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea"
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
dependencies = [
"proc-macro2",
]
@ -1173,10 +1189,12 @@ checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
name = "schemars"
version = "0.7.6"
version = "0.8.0-alpha-2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be77ed66abed6954aabf6a3e31a84706bedbf93750d267e92ef4a6d90bbd6a61"
checksum = "a0d3111dca36beaa5be680b8d031d2416e5d0e66aac8118893d42792a6ea8996"
dependencies = [
"dyn-clone",
"indexmap",
"schemars_derive",
"serde",
"serde_json",
@ -1184,9 +1202,9 @@ dependencies = [
[[package]]
name = "schemars_derive"
version = "0.7.6"
version = "0.8.0-alpha-2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11af7a475c9ee266cfaa9e303a47c830ebe072bf3101ab907a7b7b9d816fa01d"
checksum = "0e066c77ba237124b99881dfb3022cd7f4b477e19abcdfffd264c6693929a0a5"
dependencies = [
"proc-macro2",
"quote",
@ -1336,9 +1354,9 @@ dependencies = [
[[package]]
name = "synstructure"
version = "0.12.3"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545"
checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701"
dependencies = [
"proc-macro2",
"quote",

View File

@ -44,7 +44,7 @@ paste = "0.1.16"
tempfile = "3.1.0"
glob = "0.3.0"
anyhow = "1.0.31"
schemars = "0.7.6"
schemars = {version = "0.8.0-alpha-2", features = ["preserve_order"]}
directories-next = "1.0.1"
derive_more = "0.99.7"
pretty-bytes = "0.2.2"

View File

@ -61,7 +61,7 @@ On Arch Linux, you can simply install from AUR: `yay -S ripgrep-all`.
On Debian-based distributions you can download the [rga binary][latestrelease] and get the dependencies like this:
`apt install ripgrep pandoc poppler-utils ffmpeg cargo`
`apt install ripgrep pandoc poppler-utils ffmpeg`
If ripgrep is not included in your package sources, get it from [here](https://github.com/BurntSushi/ripgrep/releases).

26
doc/notes.md Normal file
View File

@ -0,0 +1,26 @@
## schema -> ui generation
https://json-schema.org/implementations.html#web-ui-generation
- https://github.com/guillotinaweb/ngx-schema-form
- https://github.com/hamzahamidi/ajsf angular igh
- https://github.com/dashjoin/json-schema-form
- https://github.com/json-editor/json-editor
- https://github.com/jsonform/jsonform
- https://github.com/vazco/uniforms
## json schema is ridiculous
"mimetypes": {
"description": "if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching",
"type": [
"array",
"null"
],
"items": {
"type": "string"
}
},
what the fuck????
this is the only thing required to see that json schema has a horrible design

View File

@ -1,7 +1,6 @@
pub mod custom;
pub mod decompress;
pub mod ffmpeg;
pub mod pandoc;
pub mod pdfpages;
pub mod poppler;
pub mod spawning;
@ -12,6 +11,7 @@ pub mod zip;
use crate::matching::*;
use crate::preproc::PreprocConfig;
use anyhow::*;
use custom::builtin_spawning_adapters;
use custom::CustomAdapterConfig;
use log::*;
use regex::Regex;
@ -35,6 +35,8 @@ pub struct AdapterMeta {
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
pub slow_matchers: Option<Vec<SlowMatcher>>,
// if true, adapter is only used when user lists it in `--rga-adapters`
pub disabled_by_default: bool,
}
impl AdapterMeta {
// todo: this is pretty ugly
@ -83,34 +85,32 @@ type AdaptersTuple = (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>);
pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
// order in descending priority
let mut enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![];
let mut disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![];
let mut adapters: Vec<Rc<dyn FileAdapter>> = vec![];
if let Some(custom_adapters) = custom_adapters {
for adapter_config in custom_adapters {
if adapter_config.default_disabled.unwrap_or(false) {
disabled_adapters.push(Rc::new(adapter_config.to_adapter()));
} else {
enabled_adapters.push(Rc::new(adapter_config.to_adapter()));
}
adapters.push(Rc::new(adapter_config.to_adapter()));
}
}
let internal_enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
let internal_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(ffmpeg::FFmpegAdapter::new()),
Rc::new(pandoc::PandocAdapter::new()),
Rc::new(poppler::PopplerAdapter::new()),
Rc::new(zip::ZipAdapter::new()),
Rc::new(decompress::DecompressAdapter::new()),
Rc::new(tar::TarAdapter::new()),
Rc::new(sqlite::SqliteAdapter::new()),
];
enabled_adapters.extend(internal_enabled_adapters);
let internal_disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(pdfpages::PdfPagesAdapter::new()),
Rc::new(tesseract::TesseractAdapter::new()),
];
disabled_adapters.extend(internal_disabled_adapters);
(enabled_adapters, disabled_adapters)
adapters.extend(
builtin_spawning_adapters
.iter()
.map(|e| -> Rc<dyn FileAdapter> { Rc::new(e.clone().to_adapter()) }),
);
adapters.extend(internal_adapters);
adapters
.into_iter()
.partition(|e| !e.metadata().disabled_by_default)
}
/**

View File

@ -1,6 +1,6 @@
use super::{spawning::SpawningFileAdapter, AdapterMeta, GetMetadata};
use crate::matching::{FastMatcher, SlowMatcher};
use lazy_static::lazy_static;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@ -12,7 +12,7 @@ pub struct CustomAdapterConfig {
/// a description of this adapter. shown in help
pub description: String,
/// if true, the adapter will be disabled by default
pub default_disabled: Option<bool>,
pub disabled_by_default: Option<bool>,
/// version identifier. used to key cache entries, change if the configuration or program changes
pub version: i32,
/// the file extensions this adapter supports. For example ["epub", "mobi"]
@ -27,6 +27,81 @@ pub struct CustomAdapterConfig {
pub args: Vec<String>,
}
fn strs(arr: &[&str]) -> Vec<String> {
arr.iter().map(ToString::to_string).collect()
}
lazy_static! {
pub static ref builtin_spawning_adapters: Vec<CustomAdapterConfig> = vec![
// from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs
// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based)
//"db" -> Just "docbook"
//"adoc" -> Just "asciidoc"
//"asciidoc" -> Just "asciidoc"
//"context" -> Just "context"
//"ctx" -> Just "context"
//"dokuwiki" -> Just "dokuwiki"
//"htm" -> Just "html"
//"html" -> Just "html"
//"json" -> Just "json"
//"latex" -> Just "latex"
//"lhs" -> Just "markdown+lhs"
//"ltx" -> Just "latex"
//"markdown" -> Just "markdown"
//"md" -> Just "markdown"
//"ms" -> Just "ms"
//"muse" -> Just "muse"
//"native" -> Just "native"
//"opml" -> Just "opml"
//"org" -> Just "org"
//"roff" -> Just "ms"
//"rst" -> Just "rst"
//"s5" -> Just "s5"
//"t2t" -> Just "t2t"
//"tei" -> Just "tei"
//"tei.xml" -> Just "tei"
//"tex" -> Just "latex"
//"texi" -> Just "texinfo"
//"texinfo" -> Just "texinfo"
//"textile" -> Just "textile"
//"text" -> Just "markdown"
//"txt" -> Just "markdown"
//"xhtml" -> Just "html"
//"wiki" -> Just "mediawiki"
CustomAdapterConfig {
name: "pandoc".to_string(),
description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text".to_string(),
version: 3,
extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb"]),
binary: "pandoc".to_string(),
mimetypes: None,
// simpler markdown (with more information loss but plainer text)
//.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans")
args: strs(&[
"--from={file_extension}",
"--to=plain",
"--wrap=none",
"--atx-headers"
]),
disabled_by_default: None
},
CustomAdapterConfig {
name: "poppler".to_owned(),
version: 1,
description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
.to_owned(),
extensions: strs(&["pdf"]),
mimetypes: Some(strs(&["application/pdf"])),
binary: "pdftotext".to_string(),
args: strs(&["-", "-"]),
disabled_by_default: None,
// postprocessors: [{name: "add_page_numbers_by_pagebreaks"}]
}
];
}
pub struct CustomSpawningFileAdapter {
binary: String,
args: Vec<String>,
@ -76,6 +151,7 @@ impl CustomAdapterConfig {
.map(|s| SlowMatcher::MimeType(s.to_string()))
.collect()
}),
disabled_by_default: self.disabled_by_default.unwrap_or(false),
},
}
}

View File

@ -30,6 +30,7 @@ lazy_static! {
.map(|s| SlowMatcher::MimeType(s.to_string()))
.collect()
),
disabled_by_default: false
};
}
#[derive(Default)]

View File

@ -21,7 +21,8 @@ lazy_static! {
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
slow_matchers: None,
disabled_by_default: false
};
}

View File

@ -1,86 +0,0 @@
use super::*;
use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::process::Command;
// from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs
// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based)
//"db" -> Just "docbook"
//"adoc" -> Just "asciidoc"
//"asciidoc" -> Just "asciidoc"
//"context" -> Just "context"
//"ctx" -> Just "context"
//"dokuwiki" -> Just "dokuwiki"
//"htm" -> Just "html"
//"html" -> Just "html"
//"json" -> Just "json"
//"latex" -> Just "latex"
//"lhs" -> Just "markdown+lhs"
//"ltx" -> Just "latex"
//"markdown" -> Just "markdown"
//"md" -> Just "markdown"
//"ms" -> Just "ms"
//"muse" -> Just "muse"
//"native" -> Just "native"
//"opml" -> Just "opml"
//"org" -> Just "org"
//"roff" -> Just "ms"
//"rst" -> Just "rst"
//"s5" -> Just "s5"
//"t2t" -> Just "t2t"
//"tei" -> Just "tei"
//"tei.xml" -> Just "tei"
//"tex" -> Just "latex"
//"texi" -> Just "texinfo"
//"texinfo" -> Just "texinfo"
//"textile" -> Just "textile"
//"text" -> Just "markdown"
//"txt" -> Just "markdown"
//"xhtml" -> Just "html"
//"wiki" -> Just "mediawiki"
static EXTENSIONS: &[&str] = &["epub", "odt", "docx", "fb2", "ipynb"];
lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "pandoc".to_owned(),
version: 3,
description:
"Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
.to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
};
}
#[derive(Default)]
pub struct PandocAdapter;
impl PandocAdapter {
pub fn new() -> PandocAdapter {
PandocAdapter
}
}
impl GetMetadata for PandocAdapter {
fn metadata(&self) -> &AdapterMeta {
&METADATA
}
}
impl SpawningFileAdapter for PandocAdapter {
fn get_exe(&self) -> &str {
"pandoc"
}
fn command(&self, filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("--from")
.arg(filepath_hint.extension().unwrap())
// simpler markdown (with more information loss but plainer text)
//.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans")
.arg("--to=plain")
.arg("--wrap=none")
.arg("--atx-headers");
cmd
}
}

View File

@ -22,7 +22,8 @@ lazy_static! {
.collect(),
slow_matchers: Some(vec![SlowMatcher::MimeType(
"application/pdf".to_owned()
)])
)]),
disabled_by_default: true
};
}
#[derive(Default)]

View File

@ -3,45 +3,11 @@ use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::io::BufReader;
use std::process::Command;
/*
static EXTENSIONS: &[&str] = &["pdf"];
lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "poppler".to_owned(),
version: 1,
description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
.to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: Some(vec![SlowMatcher::MimeType("application/pdf".to_owned())])
};
}
#[derive(Default)]
pub struct PopplerAdapter;
impl PopplerAdapter {
pub fn new() -> PopplerAdapter {
PopplerAdapter
}
}
impl GetMetadata for PopplerAdapter {
fn metadata(&self) -> &AdapterMeta {
&METADATA
}
}
impl SpawningFileAdapter for PopplerAdapter {
fn get_exe(&self) -> &str {
"pdftotext"
}
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-").arg("-");
cmd
}
postproc: "add_lines"
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Result<()> {
// prepend Page X to each line
let mut page = 1;
@ -60,3 +26,4 @@ impl SpawningFileAdapter for PopplerAdapter {
Ok(())
}
}
*/

View File

@ -22,7 +22,8 @@ lazy_static! {
.collect(),
slow_matchers: Some(vec![SlowMatcher::MimeType(
"application/x-sqlite3".to_owned()
)])
)]),
disabled_by_default: false
};
}

View File

@ -18,7 +18,8 @@ lazy_static! {
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
slow_matchers: None,
disabled_by_default: false
};
}
#[derive(Default)]

View File

@ -15,7 +15,8 @@ lazy_static! {
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: None
slow_matchers: None,
disabled_by_default: true
};
}
#[derive(Default)]

View File

@ -20,7 +20,8 @@ lazy_static! {
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: Some(vec![SlowMatcher::MimeType("application/zip".to_owned())])
slow_matchers: Some(vec![SlowMatcher::MimeType("application/zip".to_owned())]),
disabled_by_default: false
};
}
#[derive(Default)]

View File

@ -5,7 +5,7 @@ use log::*;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::ffi::OsString;
use std::{fs::File, io::Write, iter::IntoIterator, str::FromStr};
use std::{fs::File, io::Write, iter::IntoIterator, path::PathBuf, str::FromStr};
use structopt::StructOpt;
#[derive(Debug, Deserialize, Serialize)]
@ -179,6 +179,10 @@ pub struct RgaConfig {
//////////////////////////////////////////
//////////////////////////// CMD line only
//////////////////////////////////////////
#[serde(skip)]
#[structopt(long = "--rga-config-file", require_equals = true)]
pub config_file_path: Option<String>,
/// same as passing path directly, except if argument is empty
kinda hacky, but if no file is found, fzf calls rga with an empty string as the path, which causes a "No such file or directory" error from rg. So filter those cases and return specially
#[serde(skip)]
@ -222,22 +226,31 @@ fn json_merge(a: &mut Value, b: &Value) {
}
}
fn read_config_file() -> Result<(String, Value)> {
fn read_config_file(path_override: Option<String>) -> Result<(String, Value)> {
let proj = project_dirs()?;
let config_dir = proj.config_dir();
let config_filename = config_dir.join("config.json");
let config_filename = path_override
.as_ref()
.map(|e| PathBuf::from(e))
.unwrap_or(config_dir.join("config.jsonc"));
let config_filename_str = config_filename.to_string_lossy().into_owned();
if config_filename.exists() {
let config_file_contents = std::fs::read_to_string(config_filename)
.with_context(|| format!("Could not read config file json {}", config_filename_str))?;
{
// just for error messages
serde_json::from_str(&config_file_contents)
.with_context(|| format!("Error in config file: {}", config_file_contents))?;
serde_json::from_str::<RgaConfig>(&config_file_contents).with_context(|| {
format!(
"Error in config file {}: {}",
config_filename_str, config_file_contents
)
})?;
}
let config_json: serde_json::Value =
serde_json::from_str(&config_file_contents).context("Could not parse config json")?;
Ok((config_filename_str, config_json))
} else if let Some(p) = path_override.as_ref() {
Err(anyhow::anyhow!("Config file not found: {}", p))?
} else {
// write default config
std::fs::create_dir_all(config_dir)?;
@ -256,7 +269,7 @@ fn read_config_file() -> Result<(String, Value)> {
}
_ => panic!("impos"),
}
let mut configfile = File::create(config_dir.join("config.json"))?;
let mut configfile = File::create(config_filename)?;
configfile.write(serde_json::to_string_pretty(&config_json)?.as_bytes())?;
Ok((config_filename_str, config_json))
}
@ -276,7 +289,7 @@ where
{
// TODO: don't read config file in rga-preproc for performance (called for every file)
let arg_matches = RgaConfig::from_iter(args);
let arg_matches: RgaConfig = RgaConfig::from_iter(args);
let args_config = serde_json::to_value(&arg_matches)?;
let merged_config = {
@ -288,8 +301,9 @@ where
merged_config
} else {
// read from config file, env and args
let (config_filename, config_file_config) =
read_config_file(arg_matches.config_file_path)?;
let env_var_config = read_config_env()?;
let (config_filename, config_file_config) = read_config_file()?;
let mut merged_config = config_file_config.clone();
json_merge(&mut merged_config, &env_var_config);
json_merge(&mut merged_config, &args_config);
@ -357,7 +371,7 @@ pub fn split_args(is_rga_preproc: bool) -> Result<(RgaConfig, Vec<OsString>)> {
}
});
debug!("rga (our) args: {:?}", our_args);
let matches = parse_args(our_args, is_rga_preproc).context("Could not parse args")?;
let matches = parse_args(our_args, is_rga_preproc).context("Could not parse config")?;
if matches.rg_help {
passthrough_args.insert(0, "--help".into());
}