add tesseract adapter

This commit is contained in:
phiresky 2019-06-12 17:23:30 +02:00
parent d1b55e80b7
commit 1e9c2e45d6
10 changed files with 258 additions and 26 deletions

25
Cargo.lock generated
View File

@ -902,6 +902,14 @@ dependencies = [
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "remove_dir_all"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ripgrep_all"
version = "0.5.1"
@ -917,6 +925,7 @@ dependencies = [
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"paste 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
@ -929,6 +938,7 @@ dependencies = [
"size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"structopt 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)",
"tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
"tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"xz2 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1109,6 +1119,19 @@ dependencies = [
"xattr 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tempfile"
version = "3.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)",
"remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "termcolor"
version = "1.0.5"
@ -1439,6 +1462,7 @@ dependencies = [
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
"checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58"
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
"checksum rkv 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9ebeb7e046283b72b4bcd3b8ee4720cf69cd09f5b140b5ab46495df4af0e5113"
"checksum rusqlite 0.18.0 (registry+https://github.com/rust-lang/crates.io-index)" = "700720c977deb8b91c9d881dcbe3309c254d414078ca3856ea6647e569be3b66"
"checksum rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f4dccf6f4891ebcc0c39f9b6eb1a83b9bf5d747cb439ec6fba4f3b977038af"
@ -1459,6 +1483,7 @@ dependencies = [
"checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe"
"checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f"
"checksum tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)" = "b3196bfbffbba3e57481b6ea32249fbaf590396a52505a2615adbb79d9d826d3"
"checksum tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7dc4738f2e68ed2855de5ac9cdbe05c9216773ecde4739b2f095002ab03a13ef"
"checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e"
"checksum termion 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dde0593aeb8d47accea5392b39350015b5eccb12c0d98044d856983d89548dea"
"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"

View File

@ -41,3 +41,5 @@ rusqlite = { version = "0.18.0", features=["vtab"] } # "bundled"
size_format = "1.0.2"
structopt = "0.2.16"
paste = "0.1.5"
# Pinned to the versions resolved in Cargo.lock; wildcard ("*") requirements
# accept any future breaking release and are rejected by crates.io on publish.
tempfile = "3.0.8"
glob = "0.2.11"

View File

@ -18,7 +18,7 @@ similar:
To enable debug logging:
```bash
export RUST_LOG=rga=debug
export RUST_LOG=debug
export RUST_BACKTRACE=1
```

View File

@ -1,9 +1,11 @@
pub mod ffmpeg;
pub mod pandoc;
pub mod pdfpages;
pub mod poppler;
pub mod spawning;
pub mod sqlite;
pub mod tar;
pub mod tesseract;
pub mod zip;
use crate::matching::*;
use crate::preproc::PreprocConfig;
@ -69,18 +71,19 @@ pub struct AdaptInfo<'a> {
pub config: PreprocConfig<'a>,
}
pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
pub fn get_all_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
// order in descending priority
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(ffmpeg::FFmpegAdapter),
Rc::new(pandoc::PandocAdapter),
Rc::new(poppler::PopplerAdapter),
Rc::new(zip::ZipAdapter),
Rc::new(tar::TarAdapter),
Rc::new(sqlite::SqliteAdapter),
Rc::new(ffmpeg::FFmpegAdapter::new()),
Rc::new(pandoc::PandocAdapter::new()),
Rc::new(poppler::PopplerAdapter::new()),
Rc::new(zip::ZipAdapter::new()),
Rc::new(tar::TarAdapter::new()),
Rc::new(sqlite::SqliteAdapter::new()),
];
let disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
//Rc::new()
Rc::new(pdfpages::PdfPagesAdapter::new()),
Rc::new(tesseract::TesseractAdapter::new()),
];
(enabled_adapters, disabled_adapters)
}
@ -89,13 +92,14 @@ pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
* filter adapters by given names:
*
* - "" means use default enabled adapter list
* - "a,b" means use adapters a,b
* - "-a,b" means use default list except for a and b
* - "+a,b" means use default list but also a and b
*/
pub fn get_adapters_filtered<T: AsRef<str>>(
adapter_names: &[T],
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
let (def_enabled_adapters, def_disabled_adapters) = get_adapters();
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters();
let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = def_enabled_adapters
.iter()
@ -104,6 +108,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
.collect();
let mut adapters = vec![];
let mut subtractive = false;
let mut additive = false;
for (i, name) in adapter_names.iter().enumerate() {
let mut name = name.as_ref();
if i == 0 && (name.starts_with('-')) {
@ -113,6 +118,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
} else if i == 0 && (name.starts_with('+')) {
name = &name[1..];
adapters = def_enabled_adapters.clone();
additive = true;
}
if subtractive {
let inx = adapters
@ -121,12 +127,15 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
adapters.remove(inx);
} else {
adapters.push(
adapters_map
.get(name)
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
.clone(),
);
let adapter = adapters_map
.get(name)
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
.clone();
if additive {
adapters.insert(0, adapter);
} else {
adapters.push(adapter);
}
}
}
adapters

143
src/adapters/pdfpages.rs Normal file
View File

@ -0,0 +1,143 @@
use super::*;
use crate::adapters::spawning::map_exe_error;
use crate::adapters::spawning::pipe_output;
use crate::preproc::rga_preproc;
use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::fs::File;
use std::io::BufReader;
use std::io::Cursor;
use std::io::Take;
use std::path::PathBuf;
use std::process::Command;
use std::process::Stdio;
/// File extensions this adapter claims via its fast matchers.
static EXTENSIONS: &[&str] = &["pdf"];

lazy_static! {
    // Metadata describing the pdfpages adapter, built lazily on first access.
    // One extension-based fast matcher is generated per entry in EXTENSIONS;
    // no slow (content-based) matchers are registered.
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "pdfpages".to_owned(),
        version: 1,
        description: "Converts a pdf to its individual pages as png files".to_owned(),
        fast_matchers: EXTENSIONS
            .iter()
            .map(|s| FastMatcher::FileExtension(s.to_string()))
            .collect(),
        slow_matchers: None
    };
}
/// Stateless adapter that splits a PDF into one PNG image per page.
#[derive(Default)]
pub struct PdfPagesAdapter {}

impl PdfPagesAdapter {
    /// Creates a new adapter; equivalent to `PdfPagesAdapter::default()`.
    pub fn new() -> Self {
        Self::default()
    }
}
impl GetMetadata for PdfPagesAdapter {
    /// Returns the lazily-initialised static metadata of this adapter.
    fn metadata(&self) -> &AdapterMeta {
        // Explicit deref through the lazy_static wrapper.
        &*METADATA
    }
}
/*// todo: do this in an actually streaming fashion and less slow
// IEND chunk + PNG magic
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
let split_seq_inx = 8;
fn split_by_seq<'a>(
split_seq: &'a [u8],
split_inx: usize,
read: &mut Read,
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
let regex = split_seq
.iter()
.map(|c| format!("\\x{:0>2x}", c))
.collect::<Vec<_>>()
.join("");
let restr = format!("(?-u){}", regex);
eprintln!("re: {}", restr);
let re = regex::bytes::Regex::new(&restr)?;
let mut all = Vec::new();
read.read_to_end(&mut all)?;
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
let mut last = 0;
for (i, split) in re.find_iter(&all).enumerate() {
let pos = split.start() + split_inx;
out.push(Cursor::new(Vec::from(&all[last..pos])));
last = pos;
}
out.push(Cursor::new(Vec::from(&all[last..])));
Ok(out)
}*/
impl FileAdapter for PdfPagesAdapter {
    /// Renders each page of the PDF at `filepath_hint` to a PNG file in a
    /// temporary directory using GraphicsMagick (`gm convert`), then feeds
    /// every generated image back through `rga_preproc` as a nested,
    /// non-real file named `Page N.png`.
    ///
    /// Inputs that are not real files on disk (e.g. archive members) are
    /// skipped with a note written to the output.
    ///
    /// # Errors
    /// Fails if the temp dir cannot be created, `gm` cannot be spawned or
    /// exits unsuccessfully, the temp path is not valid UTF-8, a page image
    /// cannot be opened, or the nested preprocessing fails.
    fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
        // `inp` is intentionally not bound: gm reads the file by path.
        let AdaptInfo {
            filepath_hint,
            is_real_file,
            oup,
            line_prefix,
            archive_recursion_depth,
            config,
            ..
        } = ai;
        if !is_real_file {
            // todo: read to memory and then use that blob if size < max
            writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
            return Ok(());
        }

        let exe_name = "gm";
        // The directory (and all page images in it) is removed when
        // `out_dir` is dropped at the end of this function.
        let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
        // Zero-padded page numbers keep the glob results below in page order.
        let out_fname = out_dir.path().join("out%04d.png");
        eprintln!("writing to temp dir: {}", out_fname.display());

        let mut cmd = Command::new(exe_name);
        // `+adjoin` should make gm write one output file per page
        // (see the GraphicsMagick convert documentation).
        cmd.arg("convert")
            .arg("-density")
            .arg("300")
            .arg(filepath_hint)
            .arg("+adjoin")
            .arg(out_fname);
        let mut child = cmd.spawn().map_err(|e| {
            map_exe_error(
                e,
                exe_name,
                "Could not find gm. Make sure you have graphicsmagick installed.",
            )
        })?;
        let status = child.wait()?;
        if !status.success() {
            return Err(format_err!("subprocess failed: {:?}", status));
        }

        let args = config.args;
        // NOTE(review): the temp dir path is used verbatim as a glob pattern;
        // a TMPDIR containing glob metacharacters would break the match.
        let page_pattern_path = out_dir.path().join("out*.png");
        let page_pattern = page_pattern_path
            .to_str()
            .ok_or_else(|| format_err!("temp path has invalid encoding"))?;
        for (i, filename) in glob::glob(page_pattern)?.enumerate() {
            let mut ele = BufReader::new(File::open(filename?)?);
            rga_preproc(AdaptInfo {
                filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
                is_real_file: false,
                inp: &mut ele,
                oup,
                line_prefix,
                archive_recursion_depth: archive_recursion_depth + 1,
                // Caching is disabled for the intermediate page images.
                config: PreprocConfig { cache: None, args },
            })?;
        }
        Ok(())
    }
}

View File

@ -97,7 +97,8 @@ impl FileAdapter for SqliteAdapter {
while let Some(row) = z.next()? {
writeln!(
oup,
"{}: {}",
"{}{}: {}",
line_prefix,
table,
col_names
.iter()

42
src/adapters/tesseract.rs Normal file
View File

@ -0,0 +1,42 @@
use super::*;
use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::process::Command;
/// Image file extensions that are handed to tesseract.
static EXTENSIONS: &[&str] = &["jpg", "png"];

lazy_static! {
    // Lazily-built description of the tesseract adapter: one extension-based
    // fast matcher per entry in EXTENSIONS and no slow matchers.
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: String::from("tesseract"),
        version: 1,
        description: String::from("Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed."),
        fast_matchers: EXTENSIONS
            .iter()
            .map(|ext| FastMatcher::FileExtension((*ext).to_owned()))
            .collect(),
        slow_matchers: None
    };
}
/// Stateless adapter that runs OCR on images through the `tesseract` binary.
#[derive(Default)]
pub struct TesseractAdapter {}

impl TesseractAdapter {
    /// Creates a new adapter; equivalent to `TesseractAdapter::default()`.
    pub fn new() -> Self {
        Self::default()
    }
}
impl GetMetadata for TesseractAdapter {
    /// Returns the lazily-initialised static metadata of this adapter.
    fn metadata(&self) -> &AdapterMeta {
        // Explicit deref through the lazy_static wrapper.
        &*METADATA
    }
}
impl SpawningFileAdapter for TesseractAdapter {
    /// Name of the executable to spawn.
    fn get_exe(&self) -> &str {
        "tesseract"
    }

    /// Appends the two positional arguments `-` and `-` to the command.
    /// Per the tesseract CLI these presumably select stdin as the input image
    /// and stdout as the output; confirm against the tesseract manual.
    /// The file path hint is not used.
    fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
        cmd.args(&["-", "-"]);
        cmd
    }
}

View File

@ -110,7 +110,10 @@ where
{
match std::env::var(RGA_CONFIG) {
Ok(val) => {
debug!("Loading args from env {}={}", RGA_CONFIG, val);
debug!(
"Loading args from env {}={}, ignoring cmd args",
RGA_CONFIG, val
);
Ok(serde_json::from_str(&val)?)
}
Err(_) => {

View File

@ -7,14 +7,12 @@ use std::fs::File;
fn main() -> Fallible<()> {
env_logger::init();
let empty: Vec<std::ffi::OsString> = vec![];
let args = rga::args::parse_args(empty)?;
let mut arg_arr: Vec<std::ffi::OsString> = std::env::args_os().collect();
let last = arg_arr.pop().expect("No filename specified");
let args = rga::args::parse_args(arg_arr)?;
//clap::App::new("rga-preproc").arg(Arg::from_usage())
let path = {
let filepath = std::env::args_os()
.skip(1)
.next()
.ok_or(format_err!("No filename specified"))?;
let filepath = last;
eprintln!("inp fname: {:?}", filepath);
std::env::current_dir()?.join(&filepath)
};

View File

@ -11,11 +11,12 @@ fn main() -> Fallible<()> {
env_logger::init();
let (args, passthrough_args) = split_args()?;
let adapters = get_adapters_filtered(&args.adapters)?;
if args.list_adapters {
let (enabled_adapters, disabled_adapters) = get_all_adapters();
println!("Adapters:\n");
for adapter in adapters {
let print = |adapter: std::rc::Rc<dyn FileAdapter>| {
let meta = adapter.metadata();
let matchers = meta
.fast_matchers
@ -30,9 +31,17 @@ fn main() -> Fallible<()> {
meta.name, meta.description, matchers
);
println!("");
};
for adapter in enabled_adapters {
print(adapter)
}
println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+tesseract,xyz':\n");
for adapter in disabled_adapters {
print(adapter)
}
return Ok(());
}
let adapters = get_adapters_filtered(&args.adapters)?;
let pre_glob = if !args.accurate {
let extensions = adapters