diff --git a/README.md b/README.md index 174ca4a..5b554af 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ rga is a line-oriented search tool that allows you to look for a regex in a mult - I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something . It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate. - 7z adapter (couldn't find a nice to use Rust library) - allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json)) +- there's some more (mostly technical) todos in the code ## Examples @@ -40,19 +41,23 @@ On the first run rga is mostly faster because of multithreading, but on subseque rga should compile with stable Rust. To install it, simply run (your OSes equivalent of) ```bash -apt install build-essential pandoc poppler-utils +apt install build-essential pandoc poppler-utils ffmpeg cargo install ripgrep_all rga --help # works! :) ``` +You don't necessarily need to install any dependencies, but then you will see an error when trying to read from the corresponding file type (e.g. poppler-utils for pdf). + ## Technical details `rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`. -`rga-preproc [fname]` will match an adapter to the given file based on either it's filename or it's mime type (if `--accurate` is given). +`rga-preproc [fname]` will match an "adapter" to the given file based on either its filename or its mime type (if `--accurate` is given). You can see all adapters currently included in [src/adapters](src/adapters). -Some rga adapters run external binaries +Some rga adapters run external binaries to do the actual work (such as pandoc or ffmpeg), usually by writing to stdin and reading from stdout. 
+ +Most adapters read the files from a [Read](https://doc.rust-lang.org/std/io/trait.Read.html), so they work completely on streamed data (that can come from anywhere including within nested archives). rga-preproc writes ## Development diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index dd0e4d3..cae3aba 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -7,14 +7,15 @@ use std::io::BufReader; use std::process::*; // todo: // maybe todo: read list of extensions from -//ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null +// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null +// but really, the probability of getting useful information from a .flv is low static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"]; lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { name: "ffmpeg".to_owned(), version: 1, - description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(), + description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(), fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -55,7 +56,10 @@ impl FileAdapter for FFmpegAdapter { .. } = ai; if !is_real_file { - // we *could* probably adapt this to also work based on streams, but really when would you want to search for videos within archives? + // we *could* probably adapt this to also work based on streams, + // it would require using a BufReader to read at least part of the file to memory + // but really when would you want to search for videos within archives? 
+ // So instead, we only run this adapter if the file is an actual file on disk for now writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?; return Ok(()); } @@ -125,7 +129,7 @@ impl FileAdapter for FFmpegAdapter { let stdo = cmd.stdout.as_mut().expect("is piped"); let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap(); let mut time: String = "".to_owned(); - // rewrite subtitle times so they are prefixed in every line + // rewrite subtitle times so they are shown as a prefix in every line for line in BufReader::new(stdo).lines() { let line = line?; // 09:55.195 --> 09:56.730 diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs index eb81b02..8af0075 100644 --- a/src/adapters/pandoc.rs +++ b/src/adapters/pandoc.rs @@ -45,7 +45,9 @@ lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { name: "pandoc".to_owned(), version: 1, - description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(), + description: + "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text" + .to_owned(), fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs index 52912a9..13d7bbc 100644 --- a/src/adapters/pdfpages.rs +++ b/src/adapters/pdfpages.rs @@ -7,18 +7,16 @@ use lazy_static::lazy_static; use std::fs::File; use std::io::BufReader; - use std::path::PathBuf; use std::process::Command; - static EXTENSIONS: &[&str] = &["pdf"]; lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { name: "pdfpages".to_owned(), version: 1, - description: "Converts a pdf to it's individual pages as png files".to_owned(), + description: "Converts a pdf to it's individual pages as png files. 
Only useful in combination with tesseract".to_owned(), fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -41,12 +39,13 @@ impl GetMetadata for PdfPagesAdapter { } } +/// A pdf is basically converted to a zip that has Page X.png files. +/// This way, something like tesseract can process the pages individually impl FileAdapter for PdfPagesAdapter { fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { let AdaptInfo { filepath_hint, is_real_file, - inp: _, oup, line_prefix, archive_recursion_depth, @@ -75,7 +74,6 @@ impl FileAdapter for PdfPagesAdapter { map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.") })?; let args = config.args; - // TODO: how to handle this copying better? let status = cmd.wait()?; if status.success() { diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index e7151ae..1dc3424 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -34,6 +34,13 @@ impl GetMetadata for PopplerAdapter { } } impl SpawningFileAdapter for PopplerAdapter { + fn get_exe(&self) -> &str { + "pdftotext" + } + fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command { + cmd.arg("-layout").arg("-").arg("-"); + cmd + } fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> { // prepend Page X to each line let mut page = 1; @@ -48,11 +55,4 @@ impl SpawningFileAdapter for PopplerAdapter { } Ok(()) } - fn get_exe(&self) -> &str { - "pdftotext" - } - fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command { - cmd.arg("-layout").arg("-").arg("-"); - cmd - } } diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs index 4cacb5a..726617a 100644 --- a/src/adapters/spawning.rs +++ b/src/adapters/spawning.rs @@ -9,14 +9,15 @@ use std::process::Stdio; * Copy a Read to a Write, while prefixing every line with a prefix. * * Try to detect binary files and ignore them. Does not ensure any encoding in the output. 
+ * + * This is needed because the rg binary detection does not apply to preprocessed files */ pub fn postproc_line_prefix( line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write, ) -> Fallible<()> { - //std::io::copy(inp, oup)?; - //return Ok(()); + // check for null byte in first 4kB let mut reader = BufReader::with_capacity(1 << 12, inp); let fourk = reader.fill_buf()?; if fourk.contains(&0u8) { @@ -45,6 +46,8 @@ pub trait SpawningFileAdapter: GetMetadata { } } +/// replace a Command.spawn() error "File not found" with a more readable error +/// to indicate some program is not installed pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error { use std::io::ErrorKind::*; match err.kind() { @@ -70,6 +73,7 @@ pub fn pipe_output( let mut stdo = cmd.stdout.take().expect("is piped"); // TODO: how to handle this copying better? + // do we really need threads for this? crossbeam::scope(|s| -> Fallible<()> { s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors? std::io::copy(inp, &mut stdi)?; diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 8b4891e..74cbe0c 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -65,6 +65,7 @@ impl FileAdapter for SqliteAdapter { .. } = ai; if !is_real_file { + // db is in an archive // todo: read to memory and then use that blob if size < max writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?; return Ok(());