From 1e9c2e45d6faca94b000cb5d73388f840157adf7 Mon Sep 17 00:00:00 2001 From: phiresky Date: Wed, 12 Jun 2019 17:23:30 +0200 Subject: [PATCH] add tesseract adapter --- Cargo.lock | 25 +++++++ Cargo.toml | 2 + README.md | 2 +- src/adapters.rs | 39 +++++++---- src/adapters/pdfpages.rs | 143 ++++++++++++++++++++++++++++++++++++++ src/adapters/sqlite.rs | 3 +- src/adapters/tesseract.rs | 42 +++++++++++ src/args.rs | 5 +- src/bin/rga-preproc.rs | 10 ++- src/bin/rga.rs | 13 +++- 10 files changed, 258 insertions(+), 26 deletions(-) create mode 100644 src/adapters/pdfpages.rs create mode 100644 src/adapters/tesseract.rs diff --git a/Cargo.lock b/Cargo.lock index b0cb1b8..6ac79c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -902,6 +902,14 @@ dependencies = [ "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "remove_dir_all" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ripgrep_all" version = "0.5.1" @@ -917,6 +925,7 @@ dependencies = [ "env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "paste 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -929,6 +938,7 @@ dependencies = [ "size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "structopt 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)", + "tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)", "tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "xz2 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1109,6 +1119,19 @@ dependencies = [ "xattr 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tempfile" +version = "3.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", + "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "termcolor" version = "1.0.5" @@ -1439,6 +1462,7 @@ dependencies = [ "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58" "checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96" +"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" "checksum rkv 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9ebeb7e046283b72b4bcd3b8ee4720cf69cd09f5b140b5ab46495df4af0e5113" "checksum rusqlite 0.18.0 (registry+https://github.com/rust-lang/crates.io-index)" = "700720c977deb8b91c9d881dcbe3309c254d414078ca3856ea6647e569be3b66" "checksum rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f4dccf6f4891ebcc0c39f9b6eb1a83b9bf5d747cb439ec6fba4f3b977038af" @@ -1459,6 +1483,7 @@ dependencies = [ "checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe" "checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f" "checksum tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)" = "b3196bfbffbba3e57481b6ea32249fbaf590396a52505a2615adbb79d9d826d3" +"checksum tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7dc4738f2e68ed2855de5ac9cdbe05c9216773ecde4739b2f095002ab03a13ef" "checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e" "checksum termion 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dde0593aeb8d47accea5392b39350015b5eccb12c0d98044d856983d89548dea" "checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" diff --git a/Cargo.toml b/Cargo.toml index c07be7b..de19f91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,3 +41,5 @@ rusqlite = { version = "0.18.0", features=["vtab"] } # "bundled" size_format = "1.0.2" structopt = "0.2.16" paste = "0.1.5" +tempfile = "*" +glob = "*" \ No newline at end of file diff --git a/README.md b/README.md index 2de8fe9..158d50f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ similar: To enable debug logging: ```bash -export RUST_LOG=rga=debug +export RUST_LOG=debug export RUST_BACKTRACE=1 ``` diff --git a/src/adapters.rs b/src/adapters.rs index a8c8a48..c75f969 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -1,9 +1,11 @@ pub mod ffmpeg; pub mod pandoc; +pub mod pdfpages; pub mod poppler; pub mod spawning; pub mod sqlite; pub mod tar; +pub mod tesseract; pub mod zip; use crate::matching::*; use crate::preproc::PreprocConfig; @@ -69,18 +71,19 @@ pub struct AdaptInfo<'a> { pub config: PreprocConfig<'a>, } -pub fn get_adapters() -> (Vec>, Vec>) { +pub fn get_all_adapters() -> (Vec>, Vec>) { // order in descending priority let enabled_adapters: Vec> = vec![ - Rc::new(ffmpeg::FFmpegAdapter), - Rc::new(pandoc::PandocAdapter), - Rc::new(poppler::PopplerAdapter), - Rc::new(zip::ZipAdapter), - Rc::new(tar::TarAdapter), - Rc::new(sqlite::SqliteAdapter), + Rc::new(ffmpeg::FFmpegAdapter::new()), + Rc::new(pandoc::PandocAdapter::new()), + Rc::new(poppler::PopplerAdapter::new()), + Rc::new(zip::ZipAdapter::new()), + Rc::new(tar::TarAdapter::new()), + Rc::new(sqlite::SqliteAdapter::new()), ]; let disabled_adapters: Vec> = vec![ - //Rc::new() + Rc::new(pdfpages::PdfPagesAdapter::new()), + Rc::new(tesseract::TesseractAdapter::new()), ]; (enabled_adapters, disabled_adapters) } @@ -89,13 +92,14 @@ pub fn get_adapters() -> (Vec>, Vec>) { * filter adapters by given names: * * - "" means use default enabled adapter list + * - "a,b" means use adapters a,b * - "-a,b" means use default list except for a and b * - "+a,b" means use default list but also a and b */ pub fn get_adapters_filtered>( adapter_names: &[T], ) -> Fallible>> { - let (def_enabled_adapters, def_disabled_adapters) = get_adapters(); + let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = def_enabled_adapters .iter() @@ -104,6 +108,7 @@ pub fn get_adapters_filtered>( .collect(); let mut adapters = vec![]; let mut subtractive = false; + let mut additive = false; for (i, name) in adapter_names.iter().enumerate() { let mut name = name.as_ref(); if i == 0 && (name.starts_with('-')) { @@ -113,6 +118,7 @@ pub fn get_adapters_filtered>( } else if i == 0 && (name.starts_with('+')) { name = &name[1..]; adapters = def_enabled_adapters.clone(); + additive = true; } if subtractive { let inx = adapters @@ -121,12 +127,15 @@ pub fn get_adapters_filtered>( .ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?; adapters.remove(inx); } else { - adapters.push( - adapters_map - .get(name) - .ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))? - .clone(), - ); + let adapter = adapters_map + .get(name) + .ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))? + .clone(); + if additive { + adapters.insert(0, adapter); + } else { + adapters.push(adapter); + } } } adapters diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs new file mode 100644 index 0000000..7931317 --- /dev/null +++ b/src/adapters/pdfpages.rs @@ -0,0 +1,143 @@ +use super::*; +use crate::adapters::spawning::map_exe_error; +use crate::adapters::spawning::pipe_output; +use crate::preproc::rga_preproc; +use lazy_static::lazy_static; +use spawning::SpawningFileAdapter; +use std::fs::File; +use std::io::BufReader; +use std::io::Cursor; +use std::io::Take; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; + +static EXTENSIONS: &[&str] = &["pdf"]; + +lazy_static! { + static ref METADATA: AdapterMeta = AdapterMeta { + name: "pdfpages".to_owned(), + version: 1, + description: "Converts a pdf to it's individual pages as png files".to_owned(), + fast_matchers: EXTENSIONS + .iter() + .map(|s| FastMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: None + }; +} +#[derive(Default)] +pub struct PdfPagesAdapter {} + +impl PdfPagesAdapter { + pub fn new() -> PdfPagesAdapter { + PdfPagesAdapter {} + } +} + +impl GetMetadata for PdfPagesAdapter { + fn metadata(&self) -> &AdapterMeta { + &METADATA + } +} + +/*// todo: do this in an actually streaming fashion and less slow +// IEND chunk + PDF magic +// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a +let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a"); +let split_seq_inx = 8; +fn split_by_seq<'a>( + split_seq: &'a [u8], + split_inx: usize, + read: &mut Read, +) -> Fallible + 'a> { + let regex = split_seq + .iter() + .map(|c| format!("\\x{:0>2x}", c)) + .collect::>() + .join(""); + let restr = format!("(?-u){}", regex); + eprintln!("re: {}", restr); + let re = regex::bytes::Regex::new(&restr)?; + + let mut all = Vec::new(); + read.read_to_end(&mut all)?; + let mut out: Vec>> = Vec::new(); + let mut last = 0; + for (i, split) in re.find_iter(&all).enumerate() { + let pos = split.start() + split_inx; + out.push(Cursor::new(Vec::from(&all[last..pos]))); + last = pos; + } + out.push(Cursor::new(Vec::from(&all[last..]))); + Ok(out) +}*/ + +impl FileAdapter for PdfPagesAdapter { + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + let AdaptInfo { + filepath_hint, + is_real_file, + mut inp, + oup, + line_prefix, + archive_recursion_depth, + config, + .. + } = ai; + if !is_real_file { + // todo: read to memory and then use that blob if size < max + writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?; + return Ok(()); + } + let inp_fname = filepath_hint; + let exe_name = "gm"; + let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?; + let out_fname = out_dir.path().join("out%04d.png"); + eprintln!("writing to temp dir: {}", out_fname.display()); + let mut cmd = Command::new(exe_name); + cmd.arg("convert") + .arg("-density") + .arg("300") + .arg(inp_fname) + .arg("+adjoin") + .arg(out_fname); + + let mut cmd = cmd.spawn().map_err(|e| { + map_exe_error( + e, + exe_name, + "Could not find gm. Make sure you have graphicsmagick installed.", + ) + })?; + let args = config.args; + // TODO: how to handle this copying better? + + let status = cmd.wait()?; + if status.success() { + } else { + return Err(format_err!("subprocess failed: {:?}", status)); + } + for (i, filename) in glob::glob( + out_dir + .path() + .join("out*.png") + .to_str() + .expect("temp path has invalid encoding"), + )? + .enumerate() + { + let mut ele = BufReader::new(File::open(filename?)?); + rga_preproc(AdaptInfo { + filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)), + is_real_file: false, + inp: &mut ele, + oup, + line_prefix, + archive_recursion_depth: archive_recursion_depth + 1, + config: PreprocConfig { cache: None, args }, + })?; + } + Ok(()) + } +} diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index e9e0266..8b4891e 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -97,7 +97,8 @@ impl FileAdapter for SqliteAdapter { while let Some(row) = z.next()? { writeln!( oup, - "{}: {}", + "{}{}: {}", + line_prefix, table, col_names .iter() diff --git a/src/adapters/tesseract.rs b/src/adapters/tesseract.rs new file mode 100644 index 0000000..fc7bea9 --- /dev/null +++ b/src/adapters/tesseract.rs @@ -0,0 +1,42 @@ +use super::*; +use lazy_static::lazy_static; +use spawning::SpawningFileAdapter; +use std::process::Command; + +static EXTENSIONS: &[&str] = &["jpg", "png"]; + +lazy_static! { + static ref METADATA: AdapterMeta = AdapterMeta { + name: "tesseract".to_owned(), + version: 1, + description: "Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed.".to_owned(), + fast_matchers: EXTENSIONS + .iter() + .map(|s| FastMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: None + }; +} +#[derive(Default)] +pub struct TesseractAdapter {} + +impl TesseractAdapter { + pub fn new() -> TesseractAdapter { + TesseractAdapter {} + } +} + +impl GetMetadata for TesseractAdapter { + fn metadata(&self) -> &AdapterMeta { + &METADATA + } +} +impl SpawningFileAdapter for TesseractAdapter { + fn get_exe(&self) -> &str { + "tesseract" + } + fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command { + cmd.arg("-").arg("-"); + cmd + } +} diff --git a/src/args.rs b/src/args.rs index e7da226..868f45d 100644 --- a/src/args.rs +++ b/src/args.rs @@ -110,7 +110,10 @@ where { match std::env::var(RGA_CONFIG) { Ok(val) => { - debug!("Loading args from env {}={}", RGA_CONFIG, val); + debug!( + "Loading args from env {}={}, ignoring cmd args", + RGA_CONFIG, val + ); Ok(serde_json::from_str(&val)?) } Err(_) => { diff --git a/src/bin/rga-preproc.rs b/src/bin/rga-preproc.rs index e0258a8..d323e29 100644 --- a/src/bin/rga-preproc.rs +++ b/src/bin/rga-preproc.rs @@ -7,14 +7,12 @@ use std::fs::File; fn main() -> Fallible<()> { env_logger::init(); - let empty: Vec = vec![]; - let args = rga::args::parse_args(empty)?; + let mut arg_arr: Vec = std::env::args_os().collect(); + let last = arg_arr.pop().expect("No filename specified"); + let args = rga::args::parse_args(arg_arr)?; //clap::App::new("rga-preproc").arg(Arg::from_usage()) let path = { - let filepath = std::env::args_os() - .skip(1) - .next() - .ok_or(format_err!("No filename specified"))?; + let filepath = last; eprintln!("inp fname: {:?}", filepath); std::env::current_dir()?.join(&filepath) }; diff --git a/src/bin/rga.rs b/src/bin/rga.rs index aa4dafb..1324a70 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -11,11 +11,12 @@ fn main() -> Fallible<()> { env_logger::init(); let (args, passthrough_args) = split_args()?; - let adapters = get_adapters_filtered(&args.adapters)?; if args.list_adapters { + let (enabled_adapters, disabled_adapters) = get_all_adapters(); + println!("Adapters:\n"); - for adapter in adapters { + let print = |adapter: std::rc::Rc| { let meta = adapter.metadata(); let matchers = meta .fast_matchers @@ -30,9 +31,17 @@ fn main() -> Fallible<()> { meta.name, meta.description, matchers ); println!(""); + }; + for adapter in enabled_adapters { + print(adapter) + } + println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+tesseract,xyz':\n"); + for adapter in disabled_adapters { + print(adapter) } return Ok(()); } + let adapters = get_adapters_filtered(&args.adapters)?; let pre_glob = if !args.accurate { let extensions = adapters