mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
add tesseract adapter
This commit is contained in:
parent
d1b55e80b7
commit
1e9c2e45d6
25
Cargo.lock
generated
25
Cargo.lock
generated
@ -902,6 +902,14 @@ dependencies = [
|
||||
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remove_dir_all"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ripgrep_all"
|
||||
version = "0.5.1"
|
||||
@ -917,6 +925,7 @@ dependencies = [
|
||||
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"paste 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -929,6 +938,7 @@ dependencies = [
|
||||
"size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"structopt 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"xz2 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1109,6 +1119,19 @@ dependencies = [
|
||||
"xattr 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.0.5"
|
||||
@ -1439,6 +1462,7 @@ dependencies = [
|
||||
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
||||
"checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58"
|
||||
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
|
||||
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
|
||||
"checksum rkv 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9ebeb7e046283b72b4bcd3b8ee4720cf69cd09f5b140b5ab46495df4af0e5113"
|
||||
"checksum rusqlite 0.18.0 (registry+https://github.com/rust-lang/crates.io-index)" = "700720c977deb8b91c9d881dcbe3309c254d414078ca3856ea6647e569be3b66"
|
||||
"checksum rustc-demangle 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f4dccf6f4891ebcc0c39f9b6eb1a83b9bf5d747cb439ec6fba4f3b977038af"
|
||||
@ -1459,6 +1483,7 @@ dependencies = [
|
||||
"checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe"
|
||||
"checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f"
|
||||
"checksum tar 0.4.26 (registry+https://github.com/rust-lang/crates.io-index)" = "b3196bfbffbba3e57481b6ea32249fbaf590396a52505a2615adbb79d9d826d3"
|
||||
"checksum tempfile 3.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7dc4738f2e68ed2855de5ac9cdbe05c9216773ecde4739b2f095002ab03a13ef"
|
||||
"checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e"
|
||||
"checksum termion 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dde0593aeb8d47accea5392b39350015b5eccb12c0d98044d856983d89548dea"
|
||||
"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||
|
@ -41,3 +41,5 @@ rusqlite = { version = "0.18.0", features=["vtab"] } # "bundled"
|
||||
size_format = "1.0.2"
|
||||
structopt = "0.2.16"
|
||||
paste = "0.1.5"
|
||||
tempfile = "*"
|
||||
glob = "*"
|
@ -18,7 +18,7 @@ similar:
|
||||
To enable debug logging:
|
||||
|
||||
```bash
|
||||
export RUST_LOG=rga=debug
|
||||
export RUST_LOG=debug
|
||||
export RUST_BACKTRACE=1
|
||||
```
|
||||
|
||||
|
@ -1,9 +1,11 @@
|
||||
pub mod ffmpeg;
|
||||
pub mod pandoc;
|
||||
pub mod pdfpages;
|
||||
pub mod poppler;
|
||||
pub mod spawning;
|
||||
pub mod sqlite;
|
||||
pub mod tar;
|
||||
pub mod tesseract;
|
||||
pub mod zip;
|
||||
use crate::matching::*;
|
||||
use crate::preproc::PreprocConfig;
|
||||
@ -69,18 +71,19 @@ pub struct AdaptInfo<'a> {
|
||||
pub config: PreprocConfig<'a>,
|
||||
}
|
||||
|
||||
pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
||||
pub fn get_all_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
||||
// order in descending priority
|
||||
let enabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
||||
Rc::new(ffmpeg::FFmpegAdapter),
|
||||
Rc::new(pandoc::PandocAdapter),
|
||||
Rc::new(poppler::PopplerAdapter),
|
||||
Rc::new(zip::ZipAdapter),
|
||||
Rc::new(tar::TarAdapter),
|
||||
Rc::new(sqlite::SqliteAdapter),
|
||||
Rc::new(ffmpeg::FFmpegAdapter::new()),
|
||||
Rc::new(pandoc::PandocAdapter::new()),
|
||||
Rc::new(poppler::PopplerAdapter::new()),
|
||||
Rc::new(zip::ZipAdapter::new()),
|
||||
Rc::new(tar::TarAdapter::new()),
|
||||
Rc::new(sqlite::SqliteAdapter::new()),
|
||||
];
|
||||
let disabled_adapters: Vec<Rc<dyn FileAdapter>> = vec![
|
||||
//Rc::new()
|
||||
Rc::new(pdfpages::PdfPagesAdapter::new()),
|
||||
Rc::new(tesseract::TesseractAdapter::new()),
|
||||
];
|
||||
(enabled_adapters, disabled_adapters)
|
||||
}
|
||||
@ -89,13 +92,14 @@ pub fn get_adapters() -> (Vec<Rc<dyn FileAdapter>>, Vec<Rc<dyn FileAdapter>>) {
|
||||
* filter adapters by given names:
|
||||
*
|
||||
* - "" means use default enabled adapter list
|
||||
* - "a,b" means use adapters a,b
|
||||
* - "-a,b" means use default list except for a and b
|
||||
* - "+a,b" means use default list but also a and b
|
||||
*/
|
||||
pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||
adapter_names: &[T],
|
||||
) -> Fallible<Vec<Rc<dyn FileAdapter>>> {
|
||||
let (def_enabled_adapters, def_disabled_adapters) = get_adapters();
|
||||
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters();
|
||||
let adapters = if !adapter_names.is_empty() {
|
||||
let adapters_map: HashMap<_, _> = def_enabled_adapters
|
||||
.iter()
|
||||
@ -104,6 +108,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||
.collect();
|
||||
let mut adapters = vec![];
|
||||
let mut subtractive = false;
|
||||
let mut additive = false;
|
||||
for (i, name) in adapter_names.iter().enumerate() {
|
||||
let mut name = name.as_ref();
|
||||
if i == 0 && (name.starts_with('-')) {
|
||||
@ -113,6 +118,7 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||
} else if i == 0 && (name.starts_with('+')) {
|
||||
name = &name[1..];
|
||||
adapters = def_enabled_adapters.clone();
|
||||
additive = true;
|
||||
}
|
||||
if subtractive {
|
||||
let inx = adapters
|
||||
@ -121,12 +127,15 @@ pub fn get_adapters_filtered<T: AsRef<str>>(
|
||||
.ok_or_else(|| format_err!("Could not remove {}: Not in list", name))?;
|
||||
adapters.remove(inx);
|
||||
} else {
|
||||
adapters.push(
|
||||
adapters_map
|
||||
.get(name)
|
||||
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
||||
.clone(),
|
||||
);
|
||||
let adapter = adapters_map
|
||||
.get(name)
|
||||
.ok_or_else(|| format_err!("Unknown adapter: \"{}\"", name))?
|
||||
.clone();
|
||||
if additive {
|
||||
adapters.insert(0, adapter);
|
||||
} else {
|
||||
adapters.push(adapter);
|
||||
}
|
||||
}
|
||||
}
|
||||
adapters
|
||||
|
143
src/adapters/pdfpages.rs
Normal file
143
src/adapters/pdfpages.rs
Normal file
@ -0,0 +1,143 @@
|
||||
use super::*;
|
||||
use crate::adapters::spawning::map_exe_error;
|
||||
use crate::adapters::spawning::pipe_output;
|
||||
use crate::preproc::rga_preproc;
|
||||
use lazy_static::lazy_static;
|
||||
use spawning::SpawningFileAdapter;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::io::Cursor;
|
||||
use std::io::Take;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["pdf"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
name: "pdfpages".to_owned(),
|
||||
version: 1,
|
||||
description: "Converts a pdf to it's individual pages as png files".to_owned(),
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
/// Field-less adapter type; all behaviour lives in its trait implementations.
#[derive(Default)]
pub struct PdfPagesAdapter {}

impl PdfPagesAdapter {
    /// Creates a new adapter. The type carries no state, so every instance is
    /// interchangeable.
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||
|
||||
impl GetMetadata for PdfPagesAdapter {
|
||||
fn metadata(&self) -> &AdapterMeta {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
|
||||
/*// todo: do this in an actually streaming fashion and less slow
|
||||
// IEND chunk + PNG magic
|
||||
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
|
||||
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
|
||||
let split_seq_inx = 8;
|
||||
fn split_by_seq<'a>(
|
||||
split_seq: &'a [u8],
|
||||
split_inx: usize,
|
||||
read: &mut Read,
|
||||
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
|
||||
let regex = split_seq
|
||||
.iter()
|
||||
.map(|c| format!("\\x{:0>2x}", c))
|
||||
.collect::<Vec<_>>()
|
||||
.join("");
|
||||
let restr = format!("(?-u){}", regex);
|
||||
eprintln!("re: {}", restr);
|
||||
let re = regex::bytes::Regex::new(&restr)?;
|
||||
|
||||
let mut all = Vec::new();
|
||||
read.read_to_end(&mut all)?;
|
||||
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
|
||||
let mut last = 0;
|
||||
for (i, split) in re.find_iter(&all).enumerate() {
|
||||
let pos = split.start() + split_inx;
|
||||
out.push(Cursor::new(Vec::from(&all[last..pos])));
|
||||
last = pos;
|
||||
}
|
||||
out.push(Cursor::new(Vec::from(&all[last..])));
|
||||
Ok(out)
|
||||
}*/
|
||||
|
||||
impl FileAdapter for PdfPagesAdapter {
|
||||
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||
let AdaptInfo {
|
||||
filepath_hint,
|
||||
is_real_file,
|
||||
mut inp,
|
||||
oup,
|
||||
line_prefix,
|
||||
archive_recursion_depth,
|
||||
config,
|
||||
..
|
||||
} = ai;
|
||||
if !is_real_file {
|
||||
// todo: read to memory and then use that blob if size < max
|
||||
writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
|
||||
return Ok(());
|
||||
}
|
||||
let inp_fname = filepath_hint;
|
||||
let exe_name = "gm";
|
||||
let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
|
||||
let out_fname = out_dir.path().join("out%04d.png");
|
||||
eprintln!("writing to temp dir: {}", out_fname.display());
|
||||
let mut cmd = Command::new(exe_name);
|
||||
cmd.arg("convert")
|
||||
.arg("-density")
|
||||
.arg("300")
|
||||
.arg(inp_fname)
|
||||
.arg("+adjoin")
|
||||
.arg(out_fname);
|
||||
|
||||
let mut cmd = cmd.spawn().map_err(|e| {
|
||||
map_exe_error(
|
||||
e,
|
||||
exe_name,
|
||||
"Could not find gm. Make sure you have graphicsmagick installed.",
|
||||
)
|
||||
})?;
|
||||
let args = config.args;
|
||||
// TODO: how to handle this copying better?
|
||||
|
||||
let status = cmd.wait()?;
|
||||
if status.success() {
|
||||
} else {
|
||||
return Err(format_err!("subprocess failed: {:?}", status));
|
||||
}
|
||||
for (i, filename) in glob::glob(
|
||||
out_dir
|
||||
.path()
|
||||
.join("out*.png")
|
||||
.to_str()
|
||||
.expect("temp path has invalid encoding"),
|
||||
)?
|
||||
.enumerate()
|
||||
{
|
||||
let mut ele = BufReader::new(File::open(filename?)?);
|
||||
rga_preproc(AdaptInfo {
|
||||
filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
|
||||
is_real_file: false,
|
||||
inp: &mut ele,
|
||||
oup,
|
||||
line_prefix,
|
||||
archive_recursion_depth: archive_recursion_depth + 1,
|
||||
config: PreprocConfig { cache: None, args },
|
||||
})?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -97,7 +97,8 @@ impl FileAdapter for SqliteAdapter {
|
||||
while let Some(row) = z.next()? {
|
||||
writeln!(
|
||||
oup,
|
||||
"{}: {}",
|
||||
"{}{}: {}",
|
||||
line_prefix,
|
||||
table,
|
||||
col_names
|
||||
.iter()
|
||||
|
42
src/adapters/tesseract.rs
Normal file
42
src/adapters/tesseract.rs
Normal file
@ -0,0 +1,42 @@
|
||||
use super::*;
|
||||
use lazy_static::lazy_static;
|
||||
use spawning::SpawningFileAdapter;
|
||||
use std::process::Command;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["jpg", "png"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
name: "tesseract".to_owned(),
|
||||
version: 1,
|
||||
description: "Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed.".to_owned(),
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
/// Field-less adapter type; all behaviour lives in its trait implementations.
#[derive(Default)]
pub struct TesseractAdapter {}

impl TesseractAdapter {
    /// Creates a new adapter. The type carries no state, so every instance is
    /// interchangeable.
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||
|
||||
impl GetMetadata for TesseractAdapter {
|
||||
fn metadata(&self) -> &AdapterMeta {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
impl SpawningFileAdapter for TesseractAdapter {
|
||||
fn get_exe(&self) -> &str {
|
||||
"tesseract"
|
||||
}
|
||||
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
|
||||
cmd.arg("-").arg("-");
|
||||
cmd
|
||||
}
|
||||
}
|
@ -110,7 +110,10 @@ where
|
||||
{
|
||||
match std::env::var(RGA_CONFIG) {
|
||||
Ok(val) => {
|
||||
debug!("Loading args from env {}={}", RGA_CONFIG, val);
|
||||
debug!(
|
||||
"Loading args from env {}={}, ignoring cmd args",
|
||||
RGA_CONFIG, val
|
||||
);
|
||||
Ok(serde_json::from_str(&val)?)
|
||||
}
|
||||
Err(_) => {
|
||||
|
@ -7,14 +7,12 @@ use std::fs::File;
|
||||
|
||||
fn main() -> Fallible<()> {
|
||||
env_logger::init();
|
||||
let empty: Vec<std::ffi::OsString> = vec![];
|
||||
let args = rga::args::parse_args(empty)?;
|
||||
let mut arg_arr: Vec<std::ffi::OsString> = std::env::args_os().collect();
|
||||
let last = arg_arr.pop().expect("No filename specified");
|
||||
let args = rga::args::parse_args(arg_arr)?;
|
||||
//clap::App::new("rga-preproc").arg(Arg::from_usage())
|
||||
let path = {
|
||||
let filepath = std::env::args_os()
|
||||
.skip(1)
|
||||
.next()
|
||||
.ok_or(format_err!("No filename specified"))?;
|
||||
let filepath = last;
|
||||
eprintln!("inp fname: {:?}", filepath);
|
||||
std::env::current_dir()?.join(&filepath)
|
||||
};
|
||||
|
@ -11,11 +11,12 @@ fn main() -> Fallible<()> {
|
||||
env_logger::init();
|
||||
|
||||
let (args, passthrough_args) = split_args()?;
|
||||
let adapters = get_adapters_filtered(&args.adapters)?;
|
||||
|
||||
if args.list_adapters {
|
||||
let (enabled_adapters, disabled_adapters) = get_all_adapters();
|
||||
|
||||
println!("Adapters:\n");
|
||||
for adapter in adapters {
|
||||
let print = |adapter: std::rc::Rc<dyn FileAdapter>| {
|
||||
let meta = adapter.metadata();
|
||||
let matchers = meta
|
||||
.fast_matchers
|
||||
@ -30,9 +31,17 @@ fn main() -> Fallible<()> {
|
||||
meta.name, meta.description, matchers
|
||||
);
|
||||
println!("");
|
||||
};
|
||||
for adapter in enabled_adapters {
|
||||
print(adapter)
|
||||
}
|
||||
println!("The following adapters are disabled by default, and can be enabled using '--rga-adapters=+tesseract,xyz':\n");
|
||||
for adapter in disabled_adapters {
|
||||
print(adapter)
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
let adapters = get_adapters_filtered(&args.adapters)?;
|
||||
|
||||
let pre_glob = if !args.accurate {
|
||||
let extensions = adapters
|
||||
|
Loading…
Reference in New Issue
Block a user