more documentation

This commit is contained in:
phiresky 2019-06-13 16:26:03 +02:00
parent e0bc939b93
commit 5f2b5e3399
7 changed files with 36 additions and 22 deletions

View File

@ -10,6 +10,7 @@ rga is a line-oriented search tool that allows you to look for a regex in a mult
- I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something. It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the-art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate. - I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something. It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the-art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate.
- 7z adapter (couldn't find a nice to use Rust library) - 7z adapter (couldn't find a nice to use Rust library)
- allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json)) - allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json))
- there's some more (mostly technical) todos in the code
## Examples ## Examples
@ -40,19 +41,23 @@ On the first run rga is mostly faster because of multithreading, but on subseque
rga should compile with stable Rust. To install it, simply run (your OS's equivalent of) rga should compile with stable Rust. To install it, simply run (your OS's equivalent of)
```bash ```bash
apt install build-essential pandoc poppler-utils apt install build-essential pandoc poppler-utils ffmpeg
cargo install ripgrep_all cargo install ripgrep_all
rga --help # works! :) rga --help # works! :)
``` ```
You don't necessarily need to install any dependencies, but then you will see an error when trying to read from the corresponding file type (e.g. poppler-utils for pdf).
## Technical details ## Technical details
`rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`. `rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`.
`rga-preproc [fname]` will match an adapter to the given file based on either its filename or its mime type (if `--accurate` is given). `rga-preproc [fname]` will match an "adapter" to the given file based on either its filename or its mime type (if `--accurate` is given). You can see all adapters currently included in [src/adapters](src/adapters).
Some rga adapters run external binaries Some rga adapters run external binaries to do the actual work (such as pandoc or ffmpeg), usually by writing to stdin and reading from stdout.
Most adapters read the files from a [Read](https://doc.rust-lang.org/std/io/trait.Read.html), so they work completely on streamed data (that can come from anywhere including within nested archives). rga-preproc writes
## Development ## Development

View File

@ -8,13 +8,14 @@ use std::process::*;
// todo: // todo:
// maybe todo: read list of extensions from // maybe todo: read list of extensions from
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
// but really, the probability of getting useful information from a .flv is low
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"]; static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
lazy_static! { lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
name: "ffmpeg".to_owned(), name: "ffmpeg".to_owned(),
version: 1, version: 1,
description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(), description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
fast_matchers: EXTENSIONS fast_matchers: EXTENSIONS
.iter() .iter()
.map(|s| FastMatcher::FileExtension(s.to_string())) .map(|s| FastMatcher::FileExtension(s.to_string()))
@ -55,7 +56,10 @@ impl FileAdapter for FFmpegAdapter {
.. ..
} = ai; } = ai;
if !is_real_file { if !is_real_file {
// we *could* probably adapt this to also work based on streams, but really when would you want to search for videos within archives? // we *could* probably adapt this to also work based on streams,
// it would require using a BufReader to read at least part of the file to memory
// but really when would you want to search for videos within archives?
// So instead, we only run this adapter if the file is an actual file on disk for now
writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?; writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?;
return Ok(()); return Ok(());
} }
@ -125,7 +129,7 @@ impl FileAdapter for FFmpegAdapter {
let stdo = cmd.stdout.as_mut().expect("is piped"); let stdo = cmd.stdout.as_mut().expect("is piped");
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap(); let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
let mut time: String = "".to_owned(); let mut time: String = "".to_owned();
// rewrite subtitle times so they are prefixed in every line // rewrite subtitle times so they are shown as a prefix in every line
for line in BufReader::new(stdo).lines() { for line in BufReader::new(stdo).lines() {
let line = line?; let line = line?;
// 09:55.195 --> 09:56.730 // 09:55.195 --> 09:56.730

View File

@ -45,7 +45,9 @@ lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
name: "pandoc".to_owned(), name: "pandoc".to_owned(),
version: 1, version: 1,
description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(), description:
"Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
.to_owned(),
fast_matchers: EXTENSIONS fast_matchers: EXTENSIONS
.iter() .iter()
.map(|s| FastMatcher::FileExtension(s.to_string())) .map(|s| FastMatcher::FileExtension(s.to_string()))

View File

@ -7,18 +7,16 @@ use lazy_static::lazy_static;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use std::path::PathBuf; use std::path::PathBuf;
use std::process::Command; use std::process::Command;
static EXTENSIONS: &[&str] = &["pdf"]; static EXTENSIONS: &[&str] = &["pdf"];
lazy_static! { lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
name: "pdfpages".to_owned(), name: "pdfpages".to_owned(),
version: 1, version: 1,
description: "Converts a pdf to it's individual pages as png files".to_owned(), description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(),
fast_matchers: EXTENSIONS fast_matchers: EXTENSIONS
.iter() .iter()
.map(|s| FastMatcher::FileExtension(s.to_string())) .map(|s| FastMatcher::FileExtension(s.to_string()))
@ -41,12 +39,13 @@ impl GetMetadata for PdfPagesAdapter {
} }
} }
/// A pdf is basically converted to a zip that has Page X.png files.
/// This way, something like tesseract can process the pages individually
impl FileAdapter for PdfPagesAdapter { impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
is_real_file, is_real_file,
inp: _,
oup, oup,
line_prefix, line_prefix,
archive_recursion_depth, archive_recursion_depth,
@ -75,7 +74,6 @@ impl FileAdapter for PdfPagesAdapter {
map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.") map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
})?; })?;
let args = config.args; let args = config.args;
// TODO: how to handle this copying better?
let status = cmd.wait()?; let status = cmd.wait()?;
if status.success() { if status.success() {

View File

@ -34,6 +34,13 @@ impl GetMetadata for PopplerAdapter {
} }
} }
impl SpawningFileAdapter for PopplerAdapter { impl SpawningFileAdapter for PopplerAdapter {
fn get_exe(&self) -> &str {
"pdftotext"
}
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-layout").arg("-").arg("-");
cmd
}
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> { fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
// prepend Page X to each line // prepend Page X to each line
let mut page = 1; let mut page = 1;
@ -48,11 +55,4 @@ impl SpawningFileAdapter for PopplerAdapter {
} }
Ok(()) Ok(())
} }
fn get_exe(&self) -> &str {
"pdftotext"
}
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-layout").arg("-").arg("-");
cmd
}
} }

View File

@ -9,14 +9,15 @@ use std::process::Stdio;
* Copy a Read to a Write, while prefixing every line with a prefix. * Copy a Read to a Write, while prefixing every line with a prefix.
* *
* Try to detect binary files and ignore them. Does not ensure any encoding in the output. * Try to detect binary files and ignore them. Does not ensure any encoding in the output.
*
* This is needed because the rg binary detection does not apply to preprocessed files
*/ */
pub fn postproc_line_prefix( pub fn postproc_line_prefix(
line_prefix: &str, line_prefix: &str,
inp: &mut dyn Read, inp: &mut dyn Read,
oup: &mut dyn Write, oup: &mut dyn Write,
) -> Fallible<()> { ) -> Fallible<()> {
//std::io::copy(inp, oup)?; // check for null byte in first 8kB
//return Ok(());
let mut reader = BufReader::with_capacity(1 << 12, inp); let mut reader = BufReader::with_capacity(1 << 12, inp);
let fourk = reader.fill_buf()?; let fourk = reader.fill_buf()?;
if fourk.contains(&0u8) { if fourk.contains(&0u8) {
@ -45,6 +46,8 @@ pub trait SpawningFileAdapter: GetMetadata {
} }
} }
/// replace a Command.spawn() error "File not found" with a more readable error
/// to indicate some program is not installed
pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error { pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
use std::io::ErrorKind::*; use std::io::ErrorKind::*;
match err.kind() { match err.kind() {
@ -70,6 +73,7 @@ pub fn pipe_output(
let mut stdo = cmd.stdout.take().expect("is piped"); let mut stdo = cmd.stdout.take().expect("is piped");
// TODO: how to handle this copying better? // TODO: how to handle this copying better?
// do we really need threads for this?
crossbeam::scope(|s| -> Fallible<()> { crossbeam::scope(|s| -> Fallible<()> {
s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors? s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
std::io::copy(inp, &mut stdi)?; std::io::copy(inp, &mut stdi)?;

View File

@ -65,6 +65,7 @@ impl FileAdapter for SqliteAdapter {
.. ..
} = ai; } = ai;
if !is_real_file { if !is_real_file {
// db is in an archive
// todo: read to memory and then use that blob if size < max // todo: read to memory and then use that blob if size < max
writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?; writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?;
return Ok(()); return Ok(());