diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs
index 3444f9d..b4482f6 100644
--- a/src/adapters/custom.rs
+++ b/src/adapters/custom.rs
@@ -33,7 +33,8 @@ pub struct CustomAdapterConfig {
     /// {}: the file path (TODO)
     /// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file
     pub args: Vec<String>,
-    // TODO: make adapter filename configurable (?) for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
+    // TODO: make more flexible for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
+    pub output_path_hint: Option<String>,
 }
 
 fn strs(arr: &[&str]) -> Vec<String> {
@@ -117,6 +118,7 @@ pub struct CustomSpawningFileAdapter {
     binary: String,
     args: Vec<String>,
     meta: AdapterMeta,
+    output_path_hint: Option<String>,
 }
 impl GetMetadata for CustomSpawningFileAdapter {
     fn metadata(&self) -> &AdapterMeta {
@@ -183,6 +185,7 @@ impl CustomAdapterConfig {
         let ad = CustomSpawningFileAdapter {
             binary: self.binary.clone(),
             args: self.args.clone(),
+            output_path_hint: self.output_path_hint,
             meta: AdapterMeta {
                 name: self.name.clone(),
                 version: self.version,
@@ -192,7 +195,7 @@ impl CustomAdapterConfig {
                     self.binary,
                     self.args.join(" ")
                 ),
-                recurses: false,
+                recurses: true,
                 fast_matchers: self
                     .extensions
                     .iter()
diff --git a/src/adapters/postproc.rs b/src/adapters/postproc.rs
index 2fc677d..7915b87 100644
--- a/src/adapters/postproc.rs
+++ b/src/adapters/postproc.rs
@@ -15,6 +15,7 @@ use tokio_util::io::ReaderStream;
 use tokio_util::io::StreamReader;
 
 use crate::adapted_iter::AdaptedFilesIterBox;
+use crate::matching::FastFileMatcher;
 
 use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata};
 
@@ -30,7 +31,7 @@ impl GetMetadata for PostprocPrefix {
                 name: "postprocprefix".to_owned(),
                 version: 1,
                 description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(),
-                recurses: true,
+                recurses: false,
                 fast_matchers: vec![],
                 slow_matchers: None,
                 keep_fast_matchers_if_accurate: false,
@@ -143,6 +144,44 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As
     Box::pin(StreamReader::new(oup_stream))
 }
 
+
+pub struct PostprocPageBreaks {}
+impl GetMetadata for PostprocPageBreaks {
+    fn metadata(&self) -> &super::AdapterMeta {
+        lazy_static::lazy_static! {
+            static ref METADATA: AdapterMeta = AdapterMeta {
+                name: "postprocpagebreaks".to_owned(),
+                version: 1,
+                description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character".to_owned(),
+                recurses: false,
+                fast_matchers: vec![FastFileMatcher::FileExtension("txtwithpagebreaks".to_string())],
+                slow_matchers: None,
+                keep_fast_matchers_if_accurate: false,
+                disabled_by_default: false
+            };
+        }
+        &METADATA
+    }
+}
+impl FileAdapter for PostprocPageBreaks {
+    fn adapt<'a>(
+        &self,
+        a: super::AdaptInfo,
+        _detection_reason: &crate::matching::FileMatcher,
+    ) -> Result<AdaptedFilesIterBox> {
+        let read = add_newline(postproc_pagebreaks(
+            &a.line_prefix,
+            postproc_encoding(&a.line_prefix, a.inp)?,
+        ));
+        // keep adapt info (filename etc) except replace inp
+        let ai = AdaptInfo {
+            inp: Box::pin(read),
+            postprocess: false,
+            ..a
+        };
+        Ok(Box::pin(tokio_stream::once(ai)))
+    }
+}
 /// Adds the prefix "Page N:" to each line,
 /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
 /// ASCII form feeds are the page delimiters output by `pdftotext`.
@@ -184,6 +223,7 @@ mod tests {
     use anyhow::Result;
     use tokio_test::io::Builder;
     use tokio_test::io::Mock;
+    use tokio::pin;
 
     #[tokio::test]
     async fn test_with_pagebreaks() {
diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs
index 6f71915..a99d6dd 100644
--- a/src/adapters/spawning.rs
+++ b/src/adapters/spawning.rs
@@ -6,6 +6,7 @@ use log::*;
 use tokio_util::io::StreamReader;
 
 use crate::adapters::FileAdapter;
+use crate::expand::expand_str_ez;
 use std::future::Future;
 use std::path::Path;
 use std::process::{ExitStatus, Stdio};
@@ -123,7 +124,11 @@ impl FileAdapter for SpawningFileAdapter {
         debug!("executing {:?}", cmd);
         let output = pipe_output(&line_prefix, cmd, inp, self.inner.get_exe(), "")?;
         Ok(Box::pin(tokio_stream::once(AdaptInfo {
-            filepath_hint: PathBuf::from(format!("{}.txt", filepath_hint.to_string_lossy())), // TODO: customizable
+            filepath_hint: PathBuf::from(
+                expand_str_ez(self.inner.output_path_hint, |r| match r {
+                    "fullname" => &filepath_hint.to_string_lossy()
+                }
+            )),
             inp: output,
             line_prefix,
             is_real_file: false,
diff --git a/src/expand.rs b/src/expand.rs
new file mode 100644
index 0000000..f2ee294
--- /dev/null
+++ b/src/expand.rs
@@ -0,0 +1,146 @@
+use regex::Captures;
+
+// from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs
+
+pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(not(feature = "perf-literal"))]
+    fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
+        haystack.iter().position(|&b| b == needle)
+    }
+
+    #[cfg(feature = "perf-literal")]
+    fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
+        use memchr::memchr;
+        memchr(needle, haystack)
+    }
+
+    imp(needle, haystack)
+}
+
+pub fn get_capture<'a>(caps: &'a [Captures], reference: &str) -> Option<&'a str> {
+    caps.iter()
+        .flat_map(|caps| caps.name(reference))
+        .next()
+        .map(|m| m.as_str())
+}
+
+pub fn expand_str_captures(caps: &[Captures], replacement: &str) -> String {
+    let mut dst = String::new();
+    expand_str_lambda(
+        |reference: &str| get_capture(caps, reference).unwrap_or(""),
+        replacement,
+        &mut dst,
+    );
+    dst
+}
+
+pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> String
+where
+    F: Fn(&str) -> &'a str,
+{
+    let mut dst = String::new();
+    expand_str_lambda(lambda, replacement, &mut dst);
+    dst
+}
+
+pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String)
+where
+    F: Fn(&str) -> &'a str,
+{
+    let mut replacement = replacement;
+    while !replacement.is_empty() {
+        match find_byte(b'$', replacement.as_bytes()) {
+            None => break,
+            Some(i) => {
+                dst.push_str(&replacement[..i]);
+                replacement = &replacement[i..];
+            }
+        }
+        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
+            dst.push('$');
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
+            Some(cap_ref) => cap_ref,
+            None => {
+                dst.push('$');
+                replacement = &replacement[1..];
+                continue;
+            }
+        };
+        replacement = &replacement[cap_ref.end..];
+        dst.push_str(cap(cap_ref.cap));
+    }
+    dst.push_str(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+    cap: &'a str,
+    end: usize,
+}
+
+/// Parses a possible reference to a capture group name in the given text,
+/// starting at the beginning of `replacement`.
+///
+/// If no such valid reference could be found, None is returned.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
+    let mut i = 0;
+    let rep: &[u8] = replacement;
+    if rep.len() <= 1 || rep[0] != b'$' {
+        return None;
+    }
+    i += 1;
+    if rep[i] == b'{' {
+        return find_cap_ref_braced(rep, i + 1);
+    }
+    let mut cap_end = i;
+    while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
+        cap_end += 1;
+    }
+    if cap_end == i {
+        return None;
+    }
+    // We just verified that the range 0..cap_end is valid ASCII, so it must
+    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
+    // check with either unsafe or by parsing the number straight from &[u8].
+    let cap = std::str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
+    Some(CaptureRef {
+        cap: &cap,
+        end: cap_end,
+    })
+}
+
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
+    let start = i;
+    while rep.get(i).map_or(false, |&b| b != b'}') {
+        i += 1;
+    }
+    if !rep.get(i).map_or(false, |&b| b == b'}') {
+        return None;
+    }
+    // When looking at braced names, we don't put any restrictions on the name,
+    // so it's possible it could be invalid UTF-8. But a capture group name
+    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
+    // safely return None.
+    let cap = match std::str::from_utf8(&rep[start..i]) {
+        Err(_) => return None,
+        Ok(cap) => cap,
+    };
+    Some(CaptureRef {
+        cap: &cap,
+        end: i + 1,
+    })
+}
+
+/// Returns true if and only if the given byte is allowed in a capture name.
+fn is_valid_cap_letter(b: &u8) -> bool {
+    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
+}
diff --git a/src/lib.rs b/src/lib.rs
index 33b8c1a..3601486 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ pub mod pipe;
 pub mod preproc;
 pub mod preproc_cache;
 pub mod recurse;
+pub mod expand;
 #[cfg(test)]
 pub mod test_utils;
 use anyhow::Context;