mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
more documentation
This commit is contained in:
parent
e0bc939b93
commit
5f2b5e3399
11
README.md
11
README.md
@ -10,6 +10,7 @@ rga is a line-oriented search tool that allows you to look for a regex in a mult
|
|||||||
- I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something. It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the-art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate.
|
- I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something. It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the-art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate.
|
||||||
- 7z adapter (couldn't find a nice-to-use Rust library)
|
- 7z adapter (couldn't find a nice-to-use Rust library)
|
||||||
- allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json))
|
- allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json))
|
||||||
|
- there's some more (mostly technical) todos in the code
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
@ -40,19 +41,23 @@ On the first run rga is mostly faster because of multithreading, but on subseque
|
|||||||
rga should compile with stable Rust. To install it, simply run (your OS's equivalent of)
|
rga should compile with stable Rust. To install it, simply run (your OS's equivalent of)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
apt install build-essential pandoc poppler-utils
|
apt install build-essential pandoc poppler-utils ffmpeg
|
||||||
cargo install ripgrep_all
|
cargo install ripgrep_all
|
||||||
|
|
||||||
rga --help # works! :)
|
rga --help # works! :)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You don't necessarily need to install any dependencies, but then you will see an error when trying to read from the corresponding file type (e.g. poppler-utils for pdf).
|
||||||
|
|
||||||
## Technical details
|
## Technical details
|
||||||
|
|
||||||
`rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`.
|
`rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`.
|
||||||
|
|
||||||
`rga-preproc [fname]` will match an adapter to the given file based on either its filename or its mime type (if `--accurate` is given).
|
`rga-preproc [fname]` will match an "adapter" to the given file based on either its filename or its mime type (if `--accurate` is given). You can see all adapters currently included in [src/adapters](src/adapters).
|
||||||
|
|
||||||
Some rga adapters run external binaries
|
Some rga adapters run external binaries to do the actual work (such as pandoc or ffmpeg), usually by writing to stdin and reading from stdout.
|
||||||
|
|
||||||
|
Most adapters read the files from a [Read](https://doc.rust-lang.org/std/io/trait.Read.html), so they work completely on streamed data (that can come from anywhere including within nested archives). rga-preproc writes
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
@ -8,13 +8,14 @@ use std::process::*;
|
|||||||
// todo:
|
// todo:
|
||||||
// maybe todo: read list of extensions from
|
// maybe todo: read list of extensions from
|
||||||
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
|
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
|
||||||
|
// but really, the probability of getting useful information from a .flv is low
|
||||||
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
|
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||||
name: "ffmpeg".to_owned(),
|
name: "ffmpeg".to_owned(),
|
||||||
version: 1,
|
version: 1,
|
||||||
description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
|
description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
|
||||||
fast_matchers: EXTENSIONS
|
fast_matchers: EXTENSIONS
|
||||||
.iter()
|
.iter()
|
||||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||||
@ -55,7 +56,10 @@ impl FileAdapter for FFmpegAdapter {
|
|||||||
..
|
..
|
||||||
} = ai;
|
} = ai;
|
||||||
if !is_real_file {
|
if !is_real_file {
|
||||||
// we *could* probably adapt this to also work based on streams, but really when would you want to search for videos within archives?
|
// we *could* probably adapt this to also work based on streams,
|
||||||
|
// it would require using a BufReader to read at least part of the file to memory
|
||||||
|
// but really when would you want to search for videos within archives?
|
||||||
|
// So instead, we only run this adapter if the file is an actual file on disk for now
|
||||||
writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?;
|
writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
@ -125,7 +129,7 @@ impl FileAdapter for FFmpegAdapter {
|
|||||||
let stdo = cmd.stdout.as_mut().expect("is piped");
|
let stdo = cmd.stdout.as_mut().expect("is piped");
|
||||||
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
|
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
|
||||||
let mut time: String = "".to_owned();
|
let mut time: String = "".to_owned();
|
||||||
// rewrite subtitle times so they are prefixed in every line
|
// rewrite subtitle times so they are shown as a prefix in every line
|
||||||
for line in BufReader::new(stdo).lines() {
|
for line in BufReader::new(stdo).lines() {
|
||||||
let line = line?;
|
let line = line?;
|
||||||
// 09:55.195 --> 09:56.730
|
// 09:55.195 --> 09:56.730
|
||||||
|
@ -45,7 +45,9 @@ lazy_static! {
|
|||||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||||
name: "pandoc".to_owned(),
|
name: "pandoc".to_owned(),
|
||||||
version: 1,
|
version: 1,
|
||||||
description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
|
description:
|
||||||
|
"Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
|
||||||
|
.to_owned(),
|
||||||
fast_matchers: EXTENSIONS
|
fast_matchers: EXTENSIONS
|
||||||
.iter()
|
.iter()
|
||||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||||
|
@ -7,18 +7,16 @@ use lazy_static::lazy_static;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
|
|
||||||
|
|
||||||
static EXTENSIONS: &[&str] = &["pdf"];
|
static EXTENSIONS: &[&str] = &["pdf"];
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||||
name: "pdfpages".to_owned(),
|
name: "pdfpages".to_owned(),
|
||||||
version: 1,
|
version: 1,
|
||||||
description: "Converts a pdf to it's individual pages as png files".to_owned(),
|
description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(),
|
||||||
fast_matchers: EXTENSIONS
|
fast_matchers: EXTENSIONS
|
||||||
.iter()
|
.iter()
|
||||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||||
@ -41,12 +39,13 @@ impl GetMetadata for PdfPagesAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A pdf is basically converted to a zip that has Page X.png files.
|
||||||
|
/// This way, something like tesseract can process the pages individually
|
||||||
impl FileAdapter for PdfPagesAdapter {
|
impl FileAdapter for PdfPagesAdapter {
|
||||||
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||||
let AdaptInfo {
|
let AdaptInfo {
|
||||||
filepath_hint,
|
filepath_hint,
|
||||||
is_real_file,
|
is_real_file,
|
||||||
inp: _,
|
|
||||||
oup,
|
oup,
|
||||||
line_prefix,
|
line_prefix,
|
||||||
archive_recursion_depth,
|
archive_recursion_depth,
|
||||||
@ -75,7 +74,6 @@ impl FileAdapter for PdfPagesAdapter {
|
|||||||
map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
|
map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
|
||||||
})?;
|
})?;
|
||||||
let args = config.args;
|
let args = config.args;
|
||||||
// TODO: how to handle this copying better?
|
|
||||||
|
|
||||||
let status = cmd.wait()?;
|
let status = cmd.wait()?;
|
||||||
if status.success() {
|
if status.success() {
|
||||||
|
@ -34,6 +34,13 @@ impl GetMetadata for PopplerAdapter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl SpawningFileAdapter for PopplerAdapter {
|
impl SpawningFileAdapter for PopplerAdapter {
|
||||||
|
fn get_exe(&self) -> &str {
|
||||||
|
"pdftotext"
|
||||||
|
}
|
||||||
|
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
|
||||||
|
cmd.arg("-layout").arg("-").arg("-");
|
||||||
|
cmd
|
||||||
|
}
|
||||||
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
|
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
|
||||||
// prepend Page X to each line
|
// prepend Page X to each line
|
||||||
let mut page = 1;
|
let mut page = 1;
|
||||||
@ -48,11 +55,4 @@ impl SpawningFileAdapter for PopplerAdapter {
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn get_exe(&self) -> &str {
|
|
||||||
"pdftotext"
|
|
||||||
}
|
|
||||||
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
|
|
||||||
cmd.arg("-layout").arg("-").arg("-");
|
|
||||||
cmd
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -9,14 +9,15 @@ use std::process::Stdio;
|
|||||||
* Copy a Read to a Write, while prefixing every line with a prefix.
|
* Copy a Read to a Write, while prefixing every line with a prefix.
|
||||||
*
|
*
|
||||||
* Try to detect binary files and ignore them. Does not ensure any encoding in the output.
|
* Try to detect binary files and ignore them. Does not ensure any encoding in the output.
|
||||||
|
*
|
||||||
|
* This is needed because the rg binary detection does not apply to preprocessed files
|
||||||
*/
|
*/
|
||||||
pub fn postproc_line_prefix(
|
pub fn postproc_line_prefix(
|
||||||
line_prefix: &str,
|
line_prefix: &str,
|
||||||
inp: &mut dyn Read,
|
inp: &mut dyn Read,
|
||||||
oup: &mut dyn Write,
|
oup: &mut dyn Write,
|
||||||
) -> Fallible<()> {
|
) -> Fallible<()> {
|
||||||
//std::io::copy(inp, oup)?;
|
// check for null byte in first 4kB (the BufReader below has a capacity of 1 << 12 bytes)
|
||||||
//return Ok(());
|
|
||||||
let mut reader = BufReader::with_capacity(1 << 12, inp);
|
let mut reader = BufReader::with_capacity(1 << 12, inp);
|
||||||
let fourk = reader.fill_buf()?;
|
let fourk = reader.fill_buf()?;
|
||||||
if fourk.contains(&0u8) {
|
if fourk.contains(&0u8) {
|
||||||
@ -45,6 +46,8 @@ pub trait SpawningFileAdapter: GetMetadata {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// replace a Command.spawn() error "File not found" with a more readable error
|
||||||
|
/// to indicate some program is not installed
|
||||||
pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
|
pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
|
||||||
use std::io::ErrorKind::*;
|
use std::io::ErrorKind::*;
|
||||||
match err.kind() {
|
match err.kind() {
|
||||||
@ -70,6 +73,7 @@ pub fn pipe_output(
|
|||||||
let mut stdo = cmd.stdout.take().expect("is piped");
|
let mut stdo = cmd.stdout.take().expect("is piped");
|
||||||
|
|
||||||
// TODO: how to handle this copying better?
|
// TODO: how to handle this copying better?
|
||||||
|
// do we really need threads for this?
|
||||||
crossbeam::scope(|s| -> Fallible<()> {
|
crossbeam::scope(|s| -> Fallible<()> {
|
||||||
s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
|
s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
|
||||||
std::io::copy(inp, &mut stdi)?;
|
std::io::copy(inp, &mut stdi)?;
|
||||||
|
@ -65,6 +65,7 @@ impl FileAdapter for SqliteAdapter {
|
|||||||
..
|
..
|
||||||
} = ai;
|
} = ai;
|
||||||
if !is_real_file {
|
if !is_real_file {
|
||||||
|
// db is in an archive
|
||||||
// todo: read to memory and then use that blob if size < max
|
// todo: read to memory and then use that blob if size < max
|
||||||
writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?;
|
writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
|
Loading…
Reference in New Issue
Block a user