tesseract single threaded

This commit is contained in:
phiresky 2019-06-12 17:44:47 +02:00
parent 1e9c2e45d6
commit 1f6e793a7f
3 changed files with 67 additions and 70 deletions

BIN
exampledir/screenshot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

View File

@ -41,6 +41,71 @@ impl GetMetadata for PdfPagesAdapter {
} }
} }
impl FileAdapter for PdfPagesAdapter {
    /// Rasterizes each page of a PDF to a PNG with GraphicsMagick (`gm convert`)
    /// and passes every rendered page to `rga_preproc`.
    ///
    /// Inputs that are not real files on disk are skipped: a notice is written
    /// to `oup` and `Ok(())` is returned, because `gm` is handed the path in
    /// `filepath_hint` rather than the input stream.
    ///
    /// # Errors
    ///
    /// Returns an error if the temporary output directory cannot be created,
    /// if `gm` cannot be spawned or exits unsuccessfully, if the temporary path
    /// is not valid UTF-8, if a rendered page cannot be opened, or if
    /// `rga_preproc` fails for any page.
    fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
        // The input stream (`inp`) is intentionally not bound: `gm` reads the
        // file from disk via `filepath_hint`.
        let AdaptInfo {
            filepath_hint,
            is_real_file,
            oup,
            line_prefix,
            archive_recursion_depth,
            config,
            ..
        } = ai;
        if !is_real_file {
            // todo: read to memory and then use that blob if size < max
            writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
            return Ok(());
        }
        let inp_fname = filepath_hint;
        let exe_name = "gm";
        // `out_dir` has to stay alive until the loop below has finished reading
        // the rendered pages; the temp dir is cleaned up when it is dropped.
        let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
        // With `+adjoin`, gm writes one file per page and fills in `%04d` with
        // the page index.
        let out_fname = out_dir.path().join("out%04d.png");
        eprintln!("writing to temp dir: {}", out_fname.display());
        let mut cmd = Command::new(exe_name);
        cmd.arg("convert")
            .arg("-density")
            .arg("200")
            .arg(inp_fname)
            .arg("+adjoin")
            .arg(out_fname);
        let mut child = cmd.spawn().map_err(|e| {
            map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
        })?;
        let args = config.args;
        // TODO: how to handle this copying better?
        let status = child.wait()?;
        if !status.success() {
            return Err(format_err!("subprocess failed: {:?}", status));
        }
        // Report a non-UTF-8 temp path as an error instead of panicking: the
        // temp location comes from the environment, not from a program invariant.
        let page_pattern = out_dir.path().join("out*.png");
        let page_pattern = page_pattern
            .to_str()
            .ok_or_else(|| format_err!("temp path has invalid encoding"))?;
        // glob yields matches in alphabetical order, which for the zero-padded
        // file names is page order.
        for (i, filename) in glob::glob(page_pattern)?.enumerate() {
            let mut ele = BufReader::new(File::open(filename?)?);
            rga_preproc(AdaptInfo {
                filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
                is_real_file: false,
                inp: &mut ele,
                oup,
                line_prefix: &format!("{}Page {}:", line_prefix, i + 1),
                archive_recursion_depth: archive_recursion_depth + 1,
                // Rendered pages are temporary, so they are processed without a cache.
                config: PreprocConfig { cache: None, args },
            })?;
        }
        Ok(())
    }
}
/*// todo: do this in an actually streaming fashion and less slow /*// todo: do this in an actually streaming fashion and less slow
// IEND chunk + PDF magic // IEND chunk + PDF magic
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a // 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
@ -72,72 +137,3 @@ fn split_by_seq<'a>(
out.push(Cursor::new(Vec::from(&all[last..]))); out.push(Cursor::new(Vec::from(&all[last..])));
Ok(out) Ok(out)
}*/ }*/
impl FileAdapter for PdfPagesAdapter {
    /// Rasterizes each page of a PDF to a PNG with GraphicsMagick (`gm convert`)
    /// and passes every rendered page to `rga_preproc`.
    ///
    /// Inputs that are not real files on disk are skipped: a notice is written
    /// to `oup` and `Ok(())` is returned, because `gm` is handed the path in
    /// `filepath_hint` rather than the input stream.
    ///
    /// # Errors
    ///
    /// Returns an error if the temporary output directory cannot be created,
    /// if `gm` cannot be spawned or exits unsuccessfully, if the temporary path
    /// is not valid UTF-8, if a rendered page cannot be opened, or if
    /// `rga_preproc` fails for any page.
    fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
        // The input stream (`inp`) is intentionally not bound: `gm` reads the
        // file from disk via `filepath_hint`.
        let AdaptInfo {
            filepath_hint,
            is_real_file,
            oup,
            line_prefix,
            archive_recursion_depth,
            config,
            ..
        } = ai;
        if !is_real_file {
            // todo: read to memory and then use that blob if size < max
            writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
            return Ok(());
        }
        let inp_fname = filepath_hint;
        let exe_name = "gm";
        // `out_dir` has to stay alive until the loop below has finished reading
        // the rendered pages; the temp dir is cleaned up when it is dropped.
        let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
        // With `+adjoin`, gm writes one file per page and fills in `%04d` with
        // the page index.
        let out_fname = out_dir.path().join("out%04d.png");
        eprintln!("writing to temp dir: {}", out_fname.display());
        let mut cmd = Command::new(exe_name);
        cmd.arg("convert")
            .arg("-density")
            .arg("300")
            .arg(inp_fname)
            .arg("+adjoin")
            .arg(out_fname);
        let mut child = cmd.spawn().map_err(|e| {
            map_exe_error(
                e,
                exe_name,
                "Could not find gm. Make sure you have graphicsmagick installed.",
            )
        })?;
        let args = config.args;
        // TODO: how to handle this copying better?
        let status = child.wait()?;
        if !status.success() {
            return Err(format_err!("subprocess failed: {:?}", status));
        }
        // Report a non-UTF-8 temp path as an error instead of panicking: the
        // temp location comes from the environment, not from a program invariant.
        let page_pattern = out_dir.path().join("out*.png");
        let page_pattern = page_pattern
            .to_str()
            .ok_or_else(|| format_err!("temp path has invalid encoding"))?;
        // glob yields matches in alphabetical order, which for the zero-padded
        // file names is page order.
        for (i, filename) in glob::glob(page_pattern)?.enumerate() {
            let mut ele = BufReader::new(File::open(filename?)?);
            rga_preproc(AdaptInfo {
                filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
                is_real_file: false,
                inp: &mut ele,
                oup,
                line_prefix,
                archive_recursion_depth: archive_recursion_depth + 1,
                // Rendered pages are temporary, so they are processed without a cache.
                config: PreprocConfig { cache: None, args },
            })?;
        }
        Ok(())
    }
}

View File

@ -36,7 +36,8 @@ impl SpawningFileAdapter for TesseractAdapter {
"tesseract" "tesseract"
} }
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command { fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-").arg("-"); // rg already does threading
cmd.env("OMP_THREAD_LIMIT", "1").arg("-").arg("-");
cmd cmd
} }
} }