mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
add page breaks adapter (wip)
This commit is contained in:
parent
ea5d6c3367
commit
6f3488682f
@ -33,7 +33,8 @@ pub struct CustomAdapterConfig {
|
|||||||
/// {}: the file path (TODO)
|
/// {}: the file path (TODO)
|
||||||
/// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file
|
/// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file
|
||||||
pub args: Vec<String>,
|
pub args: Vec<String>,
|
||||||
// TODO: make adapter filename configurable (?) for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
|
// TODO: make more flexible for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
|
||||||
|
pub output_path_hint: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn strs(arr: &[&str]) -> Vec<String> {
|
fn strs(arr: &[&str]) -> Vec<String> {
|
||||||
@ -117,6 +118,7 @@ pub struct CustomSpawningFileAdapter {
|
|||||||
binary: String,
|
binary: String,
|
||||||
args: Vec<String>,
|
args: Vec<String>,
|
||||||
meta: AdapterMeta,
|
meta: AdapterMeta,
|
||||||
|
output_path_hint: Option<String>,
|
||||||
}
|
}
|
||||||
impl GetMetadata for CustomSpawningFileAdapter {
|
impl GetMetadata for CustomSpawningFileAdapter {
|
||||||
fn metadata(&self) -> &AdapterMeta {
|
fn metadata(&self) -> &AdapterMeta {
|
||||||
@ -183,6 +185,7 @@ impl CustomAdapterConfig {
|
|||||||
let ad = CustomSpawningFileAdapter {
|
let ad = CustomSpawningFileAdapter {
|
||||||
binary: self.binary.clone(),
|
binary: self.binary.clone(),
|
||||||
args: self.args.clone(),
|
args: self.args.clone(),
|
||||||
|
output_path_hint: self.output_path_hint,
|
||||||
meta: AdapterMeta {
|
meta: AdapterMeta {
|
||||||
name: self.name.clone(),
|
name: self.name.clone(),
|
||||||
version: self.version,
|
version: self.version,
|
||||||
@ -192,7 +195,7 @@ impl CustomAdapterConfig {
|
|||||||
self.binary,
|
self.binary,
|
||||||
self.args.join(" ")
|
self.args.join(" ")
|
||||||
),
|
),
|
||||||
recurses: false,
|
recurses: true,
|
||||||
fast_matchers: self
|
fast_matchers: self
|
||||||
.extensions
|
.extensions
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -15,6 +15,7 @@ use tokio_util::io::ReaderStream;
|
|||||||
use tokio_util::io::StreamReader;
|
use tokio_util::io::StreamReader;
|
||||||
|
|
||||||
use crate::adapted_iter::AdaptedFilesIterBox;
|
use crate::adapted_iter::AdaptedFilesIterBox;
|
||||||
|
use crate::matching::FastFileMatcher;
|
||||||
|
|
||||||
use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata};
|
use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata};
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ impl GetMetadata for PostprocPrefix {
|
|||||||
name: "postprocprefix".to_owned(),
|
name: "postprocprefix".to_owned(),
|
||||||
version: 1,
|
version: 1,
|
||||||
description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(),
|
description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(),
|
||||||
recurses: true,
|
recurses: false,
|
||||||
fast_matchers: vec![],
|
fast_matchers: vec![],
|
||||||
slow_matchers: None,
|
slow_matchers: None,
|
||||||
keep_fast_matchers_if_accurate: false,
|
keep_fast_matchers_if_accurate: false,
|
||||||
@ -143,6 +144,44 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As
|
|||||||
Box::pin(StreamReader::new(oup_stream))
|
Box::pin(StreamReader::new(oup_stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Adapter type for text input that marks page breaks with the ASCII page break character.
/// Its metadata registers it for files with the `txtwithpagebreaks` extension.
pub struct PostprocPageBreaks {}
|
||||||
|
impl GetMetadata for PostprocPageBreaks {
|
||||||
|
fn metadata(&self) -> &super::AdapterMeta {
|
||||||
|
lazy_static::lazy_static! {
|
||||||
|
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||||
|
name: "postprocpagebreaks".to_owned(),
|
||||||
|
version: 1,
|
||||||
|
description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character".to_owned(),
|
||||||
|
recurses: false,
|
||||||
|
fast_matchers: vec![FastFileMatcher::FileExtension("txtwithpagebreaks".to_string())],
|
||||||
|
slow_matchers: None,
|
||||||
|
keep_fast_matchers_if_accurate: false,
|
||||||
|
disabled_by_default: false
|
||||||
|
};
|
||||||
|
}
|
||||||
|
&METADATA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl FileAdapter for PostprocPageBreaks {
|
||||||
|
fn adapt<'a>(
|
||||||
|
&self,
|
||||||
|
a: super::AdaptInfo,
|
||||||
|
_detection_reason: &crate::matching::FileMatcher,
|
||||||
|
) -> Result<AdaptedFilesIterBox> {
|
||||||
|
let read = add_newline(postproc_pagebreaks(
|
||||||
|
&a.line_prefix,
|
||||||
|
postproc_encoding(&a.line_prefix, a.inp)?,
|
||||||
|
));
|
||||||
|
// keep adapt info (filename etc) except replace inp
|
||||||
|
let ai = AdaptInfo {
|
||||||
|
inp: Box::pin(read),
|
||||||
|
postprocess: false,
|
||||||
|
..a
|
||||||
|
};
|
||||||
|
Ok(Box::pin(tokio_stream::once(ai)))
|
||||||
|
}
|
||||||
|
}
|
||||||
/// Adds the prefix "Page N:" to each line,
|
/// Adds the prefix "Page N:" to each line,
|
||||||
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
|
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
|
||||||
/// ASCII form feeds are the page delimiters output by `pdftotext`.
|
/// ASCII form feeds are the page delimiters output by `pdftotext`.
|
||||||
@ -184,6 +223,7 @@ mod tests {
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use tokio_test::io::Builder;
|
use tokio_test::io::Builder;
|
||||||
use tokio_test::io::Mock;
|
use tokio_test::io::Mock;
|
||||||
|
use tokio::pin;
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_with_pagebreaks() {
|
async fn test_with_pagebreaks() {
|
||||||
|
@ -6,6 +6,7 @@ use log::*;
|
|||||||
use tokio_util::io::StreamReader;
|
use tokio_util::io::StreamReader;
|
||||||
|
|
||||||
use crate::adapters::FileAdapter;
|
use crate::adapters::FileAdapter;
|
||||||
|
use crate::expand::expand_str_ez;
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::process::{ExitStatus, Stdio};
|
use std::process::{ExitStatus, Stdio};
|
||||||
@ -123,7 +124,11 @@ impl FileAdapter for SpawningFileAdapter {
|
|||||||
debug!("executing {:?}", cmd);
|
debug!("executing {:?}", cmd);
|
||||||
let output = pipe_output(&line_prefix, cmd, inp, self.inner.get_exe(), "")?;
|
let output = pipe_output(&line_prefix, cmd, inp, self.inner.get_exe(), "")?;
|
||||||
Ok(Box::pin(tokio_stream::once(AdaptInfo {
|
Ok(Box::pin(tokio_stream::once(AdaptInfo {
|
||||||
filepath_hint: PathBuf::from(format!("{}.txt", filepath_hint.to_string_lossy())), // TODO: customizable
|
filepath_hint: PathBuf::from(
|
||||||
|
expand_str_ez(self.inner.output_path_hint, |r| match r {
|
||||||
|
"fullname" => &filepath_hint.to_string_lossy()
|
||||||
|
}
|
||||||
|
)),
|
||||||
inp: output,
|
inp: output,
|
||||||
line_prefix,
|
line_prefix,
|
||||||
is_real_file: false,
|
is_real_file: false,
|
||||||
|
146
src/expand.rs
Normal file
146
src/expand.rs
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
use regex::Captures;
|
||||||
|
|
||||||
|
// from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs
|
||||||
|
|
||||||
|
/// Returns the index of the first occurrence of `needle` in `haystack`,
/// or `None` when the byte does not occur.
///
/// When the `perf-literal` feature is enabled the search is delegated to
/// `memchr::memchr`; otherwise a plain iterator scan is used.
pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
    #[cfg(feature = "perf-literal")]
    fn locate(target: u8, bytes: &[u8]) -> Option<usize> {
        memchr::memchr(target, bytes)
    }

    #[cfg(not(feature = "perf-literal"))]
    fn locate(target: u8, bytes: &[u8]) -> Option<usize> {
        bytes.iter().position(|byte| *byte == target)
    }

    locate(needle, haystack)
}
|
||||||
|
|
||||||
|
pub fn get_capture<'a>(caps: &'a [Captures], reference: &str) -> Option<&'a str> {
|
||||||
|
caps.iter()
|
||||||
|
.flat_map(|caps| caps.name(reference))
|
||||||
|
.next()
|
||||||
|
.map(|m| m.as_str())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn expand_str_captures(caps: &[Captures], replacement: &str) -> String {
|
||||||
|
let mut dst = String::new();
|
||||||
|
expand_str_lambda(
|
||||||
|
|reference: &str| get_capture(caps, reference).unwrap_or(""),
|
||||||
|
replacement,
|
||||||
|
&mut dst,
|
||||||
|
);
|
||||||
|
dst
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> String
|
||||||
|
where
|
||||||
|
F: Fn(&str) -> &'a str,
|
||||||
|
{
|
||||||
|
let mut dst = String::new();
|
||||||
|
expand_str_lambda(lambda, replacement, &mut dst);
|
||||||
|
dst
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String)
|
||||||
|
where
|
||||||
|
F: Fn(&str) -> &'a str,
|
||||||
|
{
|
||||||
|
let mut replacement = replacement;
|
||||||
|
while !replacement.is_empty() {
|
||||||
|
match find_byte(b'$', replacement.as_bytes()) {
|
||||||
|
None => break,
|
||||||
|
Some(i) => {
|
||||||
|
dst.push_str(&replacement[..i]);
|
||||||
|
replacement = &replacement[i..];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
|
||||||
|
dst.push('$');
|
||||||
|
replacement = &replacement[2..];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
debug_assert!(!replacement.is_empty());
|
||||||
|
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
|
||||||
|
Some(cap_ref) => cap_ref,
|
||||||
|
None => {
|
||||||
|
dst.push('$');
|
||||||
|
replacement = &replacement[1..];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
replacement = &replacement[cap_ref.end..];
|
||||||
|
dst.push_str(cap(cap_ref.cap));
|
||||||
|
}
|
||||||
|
dst.push_str(replacement);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `CaptureRef` is a parsed reference to a capture group found inside some text.
|
||||||
|
/// The reference is stored as text: either a capture group name or a number.
|
||||||
|
///
|
||||||
|
/// It is also tagged with the position in the text immediately following the
|
||||||
|
/// capture reference, so the caller can resume scanning from there.
|
||||||
|
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||||
|
struct CaptureRef<'a> {
|
||||||
|
/// The referenced group name, without the leading `$` or any surrounding braces.
cap: &'a str,
|
||||||
|
/// Byte offset just past the reference, relative to the start of the parsed text.
end: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses a possible reference to a capture group name in the given text,
|
||||||
|
/// starting at the beginning of `replacement`.
|
||||||
|
///
|
||||||
|
/// If no such valid reference could be found, None is returned.
|
||||||
|
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
|
||||||
|
let mut i = 0;
|
||||||
|
let rep: &[u8] = replacement;
|
||||||
|
if rep.len() <= 1 || rep[0] != b'$' {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
if rep[i] == b'{' {
|
||||||
|
return find_cap_ref_braced(rep, i + 1);
|
||||||
|
}
|
||||||
|
let mut cap_end = i;
|
||||||
|
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
|
||||||
|
cap_end += 1;
|
||||||
|
}
|
||||||
|
if cap_end == i {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// We just verified that the range 0..cap_end is valid ASCII, so it must
|
||||||
|
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
|
||||||
|
// check with either unsafe or by parsing the number straight from &[u8].
|
||||||
|
let cap = std::str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
|
||||||
|
Some(CaptureRef {
|
||||||
|
cap: &cap,
|
||||||
|
end: cap_end,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
|
||||||
|
let start = i;
|
||||||
|
while rep.get(i).map_or(false, |&b| b != b'}') {
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
if !rep.get(i).map_or(false, |&b| b == b'}') {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// When looking at braced names, we don't put any restrictions on the name,
|
||||||
|
// so it's possible it could be invalid UTF-8. But a capture group name
|
||||||
|
// can never be invalid UTF-8, so if we have invalid UTF-8, then we can
|
||||||
|
// safely return None.
|
||||||
|
let cap = match std::str::from_utf8(&rep[start..i]) {
|
||||||
|
Err(_) => return None,
|
||||||
|
Ok(cap) => cap,
|
||||||
|
};
|
||||||
|
Some(CaptureRef {
|
||||||
|
cap: &cap,
|
||||||
|
end: i + 1,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reports whether `b` may appear in an unbraced capture name:
/// an ASCII letter, an ASCII digit, or an underscore.
fn is_valid_cap_letter(b: &u8) -> bool {
    b.is_ascii_alphanumeric() || *b == b'_'
}
|
@ -9,6 +9,7 @@ pub mod pipe;
|
|||||||
pub mod preproc;
|
pub mod preproc;
|
||||||
pub mod preproc_cache;
|
pub mod preproc_cache;
|
||||||
pub mod recurse;
|
pub mod recurse;
|
||||||
|
pub mod expand;
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod test_utils;
|
pub mod test_utils;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
Loading…
Reference in New Issue
Block a user