mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
add tesseract adapter
This commit is contained in:
parent
d1b55e80b7
commit
1e9c2e45d6
25
Cargo.lock
generated
25
Cargo.lock
generated
@ -902,6 +902,14 @@ dependencies = [
|
|||||||
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "remove_dir_all"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ripgrep_all"
|
name = "ripgrep_all"
|
||||||
version = "0.5.1"
|
version = "0.5.1"
|
||||||
@ -917,6 +925,7 @@ dependencies = [
|
|||||||
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
"flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"paste 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"paste 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -929,6 +938,7 @@ dependencies = [
|
|||||||
"size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"structopt 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
"structopt 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)",
|
"tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"xz2 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"xz2 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -1109,6 +1119,19 @@ dependencies = [
|
|||||||
"xattr 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"xattr 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.0.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termcolor"
|
name = "termcolor"
|
||||||
version = "1.0.5"
|
version = "1.0.5"
|
||||||
@ -1439,6 +1462,7 @@ dependencies = [
|
|||||||
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
||||||
"checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58"
|
"checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58"
|
||||||
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
|
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
|
||||||
|
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
|
||||||
"checksum rkv 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9ebeb7e046283b72b4bcd3b8ee4720cf69cd09f5b140b5ab46495df4af0e5113"
|
"checksum rkv 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9ebeb7e046283b72b4bcd3b8ee4720cf69cd09f5b140b5ab46495df4af0e5113"
|
||||||
"checksum rusqlite 0.18.0 (registry+https://github.com/rust-lang/crates.io-index)" = "700720c977deb8b91c9d881dcbe3309c254d414078ca3856ea6647e569be3b66"
|
"checksum rusqlite 0.18.0 (registry+https://github.com/rust-lang/crates.io-index)" = "700720c977deb8b91c9d881dcbe3309c254d414078ca3856ea6647e569be3b66"
|
||||||
"checksum rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f4dccf6f4891ebcc0c39f9b6eb1a83b9bf5d747cb439ec6fba4f3b977038af"
|
"checksum rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f4dccf6f4891ebcc0c39f9b6eb1a83b9bf5d747cb439ec6fba4f3b977038af"
|
||||||
@ -1459,6 +1483,7 @@ dependencies = [
|
|||||||
"checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe"
|
"checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe"
|
||||||
"checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f"
|
"checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f"
|
||||||
"checksum tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)" = "b3196bfbffbba3e57481b6ea32249fbaf590396a52505a2615adbb79d9d826d3"
|
"checksum tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)" = "b3196bfbffbba3e57481b6ea32249fbaf590396a52505a2615adbb79d9d826d3"
|
||||||
|
"checksum tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7dc4738f2e68ed2855de5ac9cdbe05c9216773ecde4739b2f095002ab03a13ef"
|
||||||
"checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e"
|
"checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e"
|
||||||
"checksum termion 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dde0593aeb8d47accea5392b39350015b5eccb12c0d98044d856983d89548dea"
|
"checksum termion 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dde0593aeb8d47accea5392b39350015b5eccb12c0d98044d856983d89548dea"
|
||||||
"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||||
|
@ -41,3 +41,5 @@ rusqlite = { version = "0.18.0", features=["vtab"] } # "bundled"
|
|||||||
size_format = "1.0.2"
|
size_format = "1.0.2"
|
||||||
structopt = "0.2.16"
|
structopt = "0.2.16"
|
||||||
paste = "0.1.5"
|
paste = "0.1.5"
|
||||||
|
tempfile = "3.0.8" # explicit requirement (matches Cargo.lock); wildcard "*" is rejected by crates.io
|
||||||
|
glob = "0.2.11" # explicit requirement (matches Cargo.lock); wildcard "*" is rejected by crates.io
|
@ -18,7 +18,7 @@ similar:
|
|||||||
To enable debug logging:
|
To enable debug logging:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export RUST_LOG=rga=debug
|
export RUST_LOG=debug
|
||||||
export RUST_BACKTRACE=1
|
export RUST_BACKTRACE=1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
pub mod ffmpeg;
|
pub mod ffmpeg;
|
||||||
pub mod pandoc;
|
pub mod pandoc;
|
||||||
|
pub mod pdfpages;
|
||||||
pub mod poppler;
|
pub mod poppler;
|
||||||
pub mod spawning;
|
pub mod spawning;
|
||||||
pub mod sqlite;
|
pub mod sqlite;
|
||||||
pub mod tar;
|
pub mod tar;
|
||||||
|
pub mod tesseract;
|
||||||
pub mod zip;
|
pub mod zip;
|
||||||
use crate::matching::*;
|
use crate::matching::*;
|
||||||
use crate::preproc::PreprocConfig;
|
use crate::preproc::PreprocConfig;
|
||||||
@ -69,18 +71,19 @@ pub struct AdaptInfo<'a> {
|
|||||||
pub config: PreprocConfig<'a>,
|
pub config: PreprocConfig<'a>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
pub fn get_all_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
||||||
// order in descending priority
|
// order in descending priority
|
||||||
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
||||||
Rc::new(ffmpeg::FFmpegAdapter),
|
Rc::new(ffmpeg::FFmpegAdapter::new()),
|
||||||
Rc::new(pandoc::PandocAdapter),
|
Rc::new(pandoc::PandocAdapter::new()),
|
||||||
Rc::new(poppler::PopplerAdapter),
|
Rc::new(poppler::PopplerAdapter::new()),
|
||||||
Rc::new(zip::ZipAdapter),
|
Rc::new(zip::ZipAdapter::new()),
|
||||||
Rc::new(tar::TarAdapter),
|
Rc::new(tar::TarAdapter::new()),
|
||||||
Rc::new(sqlite::SqliteAdapter),
|
Rc::new(sqlite::SqliteAdapter::new()),
|
||||||
];
|
];
|
||||||
let disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
let disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
||||||
//Rc::new()
|
Rc::new(pdfpages::PdfPagesAdapter::new()),
|
||||||
|
Rc::new(tesseract::TesseractAdapter::new()),
|
||||||
];
|
];
|
||||||
(enabled_adapters, disabled_adapters)
|
(enabled_adapters, disabled_adapters)
|
||||||
}
|
}
|
||||||
@ -89,13 +92,14 @@ pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
|||||||
* filter adapters by given names:
|
* filter adapters by given names:
|
||||||
*
|
*
|
||||||
* - "" means use default enabled adapter list
|
* - "" means use default enabled adapter list
|
||||||
|
* - "a,b" means use adapters a,b
|
||||||
* - "-a,b" means use default list except for a and b
|
* - "-a,b" means use default list except for a and b
|
||||||
* - "+a,b" means use default list but also a and b
|
* - "+a,b" means use default list but also a and b
|
||||||
*/
|
*/
|
||||||
pub fn get_adapters_filtered<T: AsRef<str>>(
|
pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||||
adapter_names: &[T],
|
adapter_names: &[T],
|
||||||
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
||||||
let (def_enabled_adapters, def_disabled_adapters) = get_adapters();
|
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters();
|
||||||
let adapters = if !adapter_names.is_empty() {
|
let adapters = if !adapter_names.is_empty() {
|
||||||
let adapters_map: HashMap<_, _> = def_enabled_adapters
|
let adapters_map: HashMap<_, _> = def_enabled_adapters
|
||||||
.iter()
|
.iter()
|
||||||
@ -104,6 +108,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
|||||||
.collect();
|
.collect();
|
||||||
let mut adapters = vec![];
|
let mut adapters = vec![];
|
||||||
let mut subtractive = false;
|
let mut subtractive = false;
|
||||||
|
let mut additive = false;
|
||||||
for (i, name) in adapter_names.iter().enumerate() {
|
for (i, name) in adapter_names.iter().enumerate() {
|
||||||
let mut name = name.as_ref();
|
let mut name = name.as_ref();
|
||||||
if i == 0 && (name.starts_with('-')) {
|
if i == 0 && (name.starts_with('-')) {
|
||||||
@ -113,6 +118,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
|||||||
} else if i == 0 && (name.starts_with('+')) {
|
} else if i == 0 && (name.starts_with('+')) {
|
||||||
name = &name[1..];
|
name = &name[1..];
|
||||||
adapters = def_enabled_adapters.clone();
|
adapters = def_enabled_adapters.clone();
|
||||||
|
additive = true;
|
||||||
}
|
}
|
||||||
if subtractive {
|
if subtractive {
|
||||||
let inx = adapters
|
let inx = adapters
|
||||||
@ -121,12 +127,15 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
|||||||
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
||||||
adapters.remove(inx);
|
adapters.remove(inx);
|
||||||
} else {
|
} else {
|
||||||
adapters.push(
|
let adapter = adapters_map
|
||||||
adapters_map
|
|
||||||
.get(name)
|
.get(name)
|
||||||
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
||||||
.clone(),
|
.clone();
|
||||||
);
|
if additive {
|
||||||
|
adapters.insert(0, adapter);
|
||||||
|
} else {
|
||||||
|
adapters.push(adapter);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
adapters
|
adapters
|
||||||
|
143
src/adapters/pdfpages.rs
Normal file
143
src/adapters/pdfpages.rs
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
use super::*;
|
||||||
|
use crate::adapters::spawning::map_exe_error;
|
||||||
|
use crate::adapters::spawning::pipe_output;
|
||||||
|
use crate::preproc::rga_preproc;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use spawning::SpawningFileAdapter;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
use std::io::Cursor;
|
||||||
|
use std::io::Take;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::process::Command;
|
||||||
|
use std::process::Stdio;
|
||||||
|
|
||||||
|
static EXTENSIONS: &[&str] = &["pdf"];
|
||||||
|
|
||||||
|
lazy_static! {
    // Static description of the `pdfpages` adapter, initialised lazily on first access.
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "pdfpages".to_owned(),
        version: 1,
        description: "Converts a pdf to its individual pages as png files".to_owned(),
        // Files are selected purely by extension (see EXTENSIONS); there are no slow matchers.
        fast_matchers: EXTENSIONS
            .iter()
            .map(|s| FastMatcher::FileExtension(s.to_string()))
            .collect(),
        slow_matchers: None
    };
}
|
||||||
|
/// Adapter type for the `pdfpages` adapter. It carries no state of its own.
#[derive(Default)]
pub struct PdfPagesAdapter {}

impl PdfPagesAdapter {
    /// Creates a new, stateless `PdfPagesAdapter`.
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||||
|
|
||||||
|
impl GetMetadata for PdfPagesAdapter {
|
||||||
|
fn metadata(&self) -> &AdapterMeta {
|
||||||
|
&METADATA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*// todo: do this in an actually streaming fashion and less slow
|
||||||
|
// IEND chunk + PDF magic
|
||||||
|
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
|
||||||
|
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
|
||||||
|
let split_seq_inx = 8;
|
||||||
|
fn split_by_seq<'a>(
|
||||||
|
split_seq: &'a [u8],
|
||||||
|
split_inx: usize,
|
||||||
|
read: &mut Read,
|
||||||
|
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
|
||||||
|
let regex = split_seq
|
||||||
|
.iter()
|
||||||
|
.map(|c| format!("\\x{:0>2x}", c))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("");
|
||||||
|
let restr = format!("(?-u){}", regex);
|
||||||
|
eprintln!("re: {}", restr);
|
||||||
|
let re = regex::bytes::Regex::new(&restr)?;
|
||||||
|
|
||||||
|
let mut all = Vec::new();
|
||||||
|
read.read_to_end(&mut all)?;
|
||||||
|
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
|
||||||
|
let mut last = 0;
|
||||||
|
for (i, split) in re.find_iter(&all).enumerate() {
|
||||||
|
let pos = split.start() + split_inx;
|
||||||
|
out.push(Cursor::new(Vec::from(&all[last..pos])));
|
||||||
|
last = pos;
|
||||||
|
}
|
||||||
|
out.push(Cursor::new(Vec::from(&all[last..])));
|
||||||
|
Ok(out)
|
||||||
|
}*/
|
||||||
|
|
||||||
|
impl FileAdapter for PdfPagesAdapter {
|
||||||
|
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||||
|
let AdaptInfo {
|
||||||
|
filepath_hint,
|
||||||
|
is_real_file,
|
||||||
|
mut inp,
|
||||||
|
oup,
|
||||||
|
line_prefix,
|
||||||
|
archive_recursion_depth,
|
||||||
|
config,
|
||||||
|
..
|
||||||
|
} = ai;
|
||||||
|
if !is_real_file {
|
||||||
|
// todo: read to memory and then use that blob if size < max
|
||||||
|
writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let inp_fname = filepath_hint;
|
||||||
|
let exe_name = "gm";
|
||||||
|
let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
|
||||||
|
let out_fname = out_dir.path().join("out%04d.png");
|
||||||
|
eprintln!("writing to temp dir: {}", out_fname.display());
|
||||||
|
let mut cmd = Command::new(exe_name);
|
||||||
|
cmd.arg("convert")
|
||||||
|
.arg("-density")
|
||||||
|
.arg("300")
|
||||||
|
.arg(inp_fname)
|
||||||
|
.arg("+adjoin")
|
||||||
|
.arg(out_fname);
|
||||||
|
|
||||||
|
let mut cmd = cmd.spawn().map_err(|e| {
|
||||||
|
map_exe_error(
|
||||||
|
e,
|
||||||
|
exe_name,
|
||||||
|
"Could not find gm. Make sure you have graphicsmagick installed.",
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let args = config.args;
|
||||||
|
// TODO: how to handle this copying better?
|
||||||
|
|
||||||
|
let status = cmd.wait()?;
|
||||||
|
if status.success() {
|
||||||
|
} else {
|
||||||
|
return Err(format_err!("subprocess failed: {:?}", status));
|
||||||
|
}
|
||||||
|
for (i, filename) in glob::glob(
|
||||||
|
out_dir
|
||||||
|
.path()
|
||||||
|
.join("out*.png")
|
||||||
|
.to_str()
|
||||||
|
.expect("temp path has invalid encoding"),
|
||||||
|
)?
|
||||||
|
.enumerate()
|
||||||
|
{
|
||||||
|
let mut ele = BufReader::new(File::open(filename?)?);
|
||||||
|
rga_preproc(AdaptInfo {
|
||||||
|
filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
|
||||||
|
is_real_file: false,
|
||||||
|
inp: &mut ele,
|
||||||
|
oup,
|
||||||
|
line_prefix,
|
||||||
|
archive_recursion_depth: archive_recursion_depth + 1,
|
||||||
|
config: PreprocConfig { cache: None, args },
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
@ -97,7 +97,8 @@ impl FileAdapter for SqliteAdapter {
|
|||||||
while let Some(row) = z.next()? {
|
while let Some(row) = z.next()? {
|
||||||
writeln!(
|
writeln!(
|
||||||
oup,
|
oup,
|
||||||
"{}: {}",
|
"{}{}: {}",
|
||||||
|
line_prefix,
|
||||||
table,
|
table,
|
||||||
col_names
|
col_names
|
||||||
.iter()
|
.iter()
|
||||||
|
42
src/adapters/tesseract.rs
Normal file
42
src/adapters/tesseract.rs
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
use super::*;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use spawning::SpawningFileAdapter;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
static EXTENSIONS: &[&str] = &["jpg", "png"];
|
||||||
|
|
||||||
|
lazy_static! {
    // Static description of the `tesseract` adapter, initialised lazily on first access.
    static ref METADATA: AdapterMeta = {
        // Files are selected purely by extension (see EXTENSIONS).
        let fast_matchers = EXTENSIONS
            .iter()
            .map(|ext| FastMatcher::FileExtension(ext.to_string()))
            .collect();
        AdapterMeta {
            name: String::from("tesseract"),
            version: 1,
            description: String::from("Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed."),
            fast_matchers,
            slow_matchers: None,
        }
    };
}
|
||||||
|
/// Adapter type for the `tesseract` adapter. It carries no state of its own.
#[derive(Default)]
pub struct TesseractAdapter {}

impl TesseractAdapter {
    /// Creates a new, stateless `TesseractAdapter`.
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||||
|
|
||||||
|
impl GetMetadata for TesseractAdapter {
|
||||||
|
fn metadata(&self) -> &AdapterMeta {
|
||||||
|
&METADATA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl SpawningFileAdapter for TesseractAdapter {
|
||||||
|
fn get_exe(&self) -> &str {
|
||||||
|
"tesseract"
|
||||||
|
}
|
||||||
|
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
|
||||||
|
cmd.arg("-").arg("-");
|
||||||
|
cmd
|
||||||
|
}
|
||||||
|
}
|
@ -110,7 +110,10 @@ where
|
|||||||
{
|
{
|
||||||
match std::env::var(RGA_CONFIG) {
|
match std::env::var(RGA_CONFIG) {
|
||||||
Ok(val) => {
|
Ok(val) => {
|
||||||
debug!("Loading args from env {}={}", RGA_CONFIG, val);
|
debug!(
|
||||||
|
"Loading args from env {}={}, ignoring cmd args",
|
||||||
|
RGA_CONFIG, val
|
||||||
|
);
|
||||||
Ok(serde_json::from_str(&val)?)
|
Ok(serde_json::from_str(&val)?)
|
||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
|
@ -7,14 +7,12 @@ use std::fs::File;
|
|||||||
|
|
||||||
fn main() -> Fallible<()> {
|
fn main() -> Fallible<()> {
|
||||||
env_logger::init();
|
env_logger::init();
|
||||||
let empty: Vec<std::ffi::OsString> = vec![];
|
let mut arg_arr: Vec<std::ffi::OsString> = std::env::args_os().collect();
|
||||||
let args = rga::args::parse_args(empty)?;
|
let last = arg_arr.pop().expect("No filename specified");
|
||||||
|
let args = rga::args::parse_args(arg_arr)?;
|
||||||
//clap::App::new("rga-preproc").arg(Arg::from_usage())
|
//clap::App::new("rga-preproc").arg(Arg::from_usage())
|
||||||
let path = {
|
let path = {
|
||||||
let filepath = std::env::args_os()
|
let filepath = last;
|
||||||
.skip(1)
|
|
||||||
.next()
|
|
||||||
.ok_or(format_err!("No filename specified"))?;
|
|
||||||
eprintln!("inp fname: {:?}", filepath);
|
eprintln!("inp fname: {:?}", filepath);
|
||||||
std::env::current_dir()?.join(&filepath)
|
std::env::current_dir()?.join(&filepath)
|
||||||
};
|
};
|
||||||
|
@ -11,11 +11,12 @@ fn main() -> Fallible<()> {
|
|||||||
env_logger::init();
|
env_logger::init();
|
||||||
|
|
||||||
let (args, passthrough_args) = split_args()?;
|
let (args, passthrough_args) = split_args()?;
|
||||||
let adapters = get_adapters_filtered(&args.adapters)?;
|
|
||||||
|
|
||||||
if args.list_adapters {
|
if args.list_adapters {
|
||||||
|
let (enabled_adapters, disabled_adapters) = get_all_adapters();
|
||||||
|
|
||||||
println!("Adapters:\n");
|
println!("Adapters:\n");
|
||||||
for adapter in adapters {
|
let print = |adapter: std::rc::Rc<dyn FileAdapter>| {
|
||||||
let meta = adapter.metadata();
|
let meta = adapter.metadata();
|
||||||
let matchers = meta
|
let matchers = meta
|
||||||
.fast_matchers
|
.fast_matchers
|
||||||
@ -30,9 +31,17 @@ fn main() -> Fallible<()> {
|
|||||||
meta.name, meta.description, matchers
|
meta.name, meta.description, matchers
|
||||||
);
|
);
|
||||||
println!("");
|
println!("");
|
||||||
|
};
|
||||||
|
for adapter in enabled_adapters {
|
||||||
|
print(adapter)
|
||||||
|
}
|
||||||
|
println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+tesseract,xyz':\n");
|
||||||
|
for adapter in disabled_adapters {
|
||||||
|
print(adapter)
|
||||||
}
|
}
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
let adapters = get_adapters_filtered(&args.adapters)?;
|
||||||
|
|
||||||
let pre_glob = if !args.accurate {
|
let pre_glob = if !args.accurate {
|
||||||
let extensions = adapters
|
let extensions = adapters
|
||||||
|
Loading…
Reference in New Issue
Block a user