add page breaks adapter (wip)

2024-11-24 04:14:57 +00:00 · 2022-12-25 17:37:31 +01:00 · 2022-12-25 17:37:31 +01:00 · 6f3488682f
commit 6f3488682f
parent ea5d6c3367
5 changed files with 199 additions and 4 deletions
--- a/src/adapters/custom.rs
+++ b/src/adapters/custom.rs
@ -33,7 +33,8 @@ pub struct CustomAdapterConfig {
    /// {}: the file path (TODO)
    /// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file
    pub args: Vec<String>,
-    // TODO: make adapter filename configurable (?) for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
+    // TODO: make more flexible for inner matching (e.g. foo.tar.gz should be foo.tar after gunzipping)
+    pub output_path_hint: Option<String>,
 }

 fn strs(arr: &[&str]) -> Vec<String> {
@ -117,6 +118,7 @@ pub struct CustomSpawningFileAdapter {
    binary: String,
    args: Vec<String>,
    meta: AdapterMeta,
+    output_path_hint: Option<String>,
 }
 impl GetMetadata for CustomSpawningFileAdapter {
    fn metadata(&self) -> &AdapterMeta {
@ -183,6 +185,7 @@ impl CustomAdapterConfig {
        let ad = CustomSpawningFileAdapter {
            binary: self.binary.clone(),
            args: self.args.clone(),
+            output_path_hint: self.output_path_hint,
            meta: AdapterMeta {
                name: self.name.clone(),
                version: self.version,
@ -192,7 +195,7 @@ impl CustomAdapterConfig {
                    self.binary,
                    self.args.join(" ")
                ),
-                recurses: false,
+                recurses: true,
                fast_matchers: self
                    .extensions
                    .iter()
--- a/src/adapters/postproc.rs
+++ b/src/adapters/postproc.rs
@ -15,6 +15,7 @@ use tokio_util::io::ReaderStream;
 use tokio_util::io::StreamReader;

 use crate::adapted_iter::AdaptedFilesIterBox;
+use crate::matching::FastFileMatcher;

 use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata};

@ -30,7 +31,7 @@ impl GetMetadata for PostprocPrefix {
                name: "postprocprefix".to_owned(),
                version: 1,
                description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(),
-                recurses: true,
+                recurses: false,
                fast_matchers: vec![],
                slow_matchers: None,
                keep_fast_matchers_if_accurate: false,
@ -143,6 +144,44 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As
    Box::pin(StreamReader::new(oup_stream))
 }

+
+/// Adapter for text input that marks page breaks with the ASCII page break character
+/// (see the `description` in its metadata).
+pub struct PostprocPageBreaks {}
+impl GetMetadata for PostprocPageBreaks {
+    /// Returns the adapter's metadata; the value is created lazily and shared by all calls.
+    fn metadata(&self) -> &AdapterMeta {
+        lazy_static::lazy_static! {
+            static ref PAGE_BREAKS_META: AdapterMeta = AdapterMeta {
+                name: String::from("postprocpagebreaks"),
+                version: 1,
+                description: String::from("Adds the page number to each line for an input file that specifies page breaks as ascii page break character"),
+                recurses: false,
+                // selected purely by file extension; no slow matchers are registered
+                fast_matchers: vec![FastFileMatcher::FileExtension(String::from("txtwithpagebreaks"))],
+                slow_matchers: None,
+                keep_fast_matchers_if_accurate: false,
+                disabled_by_default: false
+            };
+        }
+        &PAGE_BREAKS_META
+    }
+}
+impl FileAdapter for PostprocPageBreaks {
+    /// Runs the input of `a` through `postproc_encoding`, `postproc_pagebreaks` and
+    /// `add_newline` (in that order) and yields a single `AdaptInfo` carrying the result.
+    ///
+    /// Fails only if `postproc_encoding` returns an error.
+    fn adapt<'a>(
+        &self,
+        a: super::AdaptInfo,
+        _detection_reason: &crate::matching::FileMatcher,
+    ) -> Result<AdaptedFilesIterBox> {
+        let decoded = postproc_encoding(&a.line_prefix, a.inp)?;
+        let paged = postproc_pagebreaks(&a.line_prefix, decoded);
+        // every field of the incoming adapt info (filename etc) is kept as-is, except that
+        // `inp` is swapped for the processed stream and `postprocess` is cleared
+        let adapted = AdaptInfo {
+            inp: Box::pin(add_newline(paged)),
+            postprocess: false,
+            ..a
+        };
+        Ok(Box::pin(tokio_stream::once(adapted)))
+    }
+}
 /// Adds the prefix "Page N:" to each line,
 /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
 /// ASCII form feeds are the page delimiters output by `pdftotext`.
@ -184,6 +223,7 @@ mod tests {
    use anyhow::Result;
    use tokio_test::io::Builder;
    use tokio_test::io::Mock;
+    use tokio::pin;

    #[tokio::test]
    async fn test_with_pagebreaks() {
--- a/src/adapters/spawning.rs
+++ b/src/adapters/spawning.rs
@ -6,6 +6,7 @@ use log::*;
 use tokio_util::io::StreamReader;

 use crate::adapters::FileAdapter;
+use crate::expand::expand_str_ez;
 use std::future::Future;
 use std::path::Path;
 use std::process::{ExitStatus, Stdio};
@ -123,7 +124,11 @@ impl FileAdapter for SpawningFileAdapter {
        debug!("executing {:?}", cmd);
        let output = pipe_output(&line_prefix, cmd, inp, self.inner.get_exe(), "")?;
        Ok(Box::pin(tokio_stream::once(AdaptInfo {
-            filepath_hint: PathBuf::from(format!("{}.txt", filepath_hint.to_string_lossy())), // TODO: customizable
+            filepath_hint: PathBuf::from(
+                expand_str_ez(self.inner.output_path_hint, |r| match r {
+                    "fullname" => &filepath_hint.to_string_lossy()
+                }
+            )),
            inp: output,
            line_prefix,
            is_real_file: false,
--- a/src/expand.rs
+++ b/src/expand.rs
@ -0,0 +1,146 @@
+use regex::Captures;
+
+// from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs
+
+/// Returns the index of the first occurrence of `needle` in `haystack`, or `None` if the
+/// byte does not occur.
+///
+/// With the `perf-literal` feature enabled the search is delegated to `memchr`; otherwise a
+/// plain linear scan is used.
+pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(feature = "perf-literal")]
+    fn search(byte: u8, bytes: &[u8]) -> Option<usize> {
+        memchr::memchr(byte, bytes)
+    }
+
+    #[cfg(not(feature = "perf-literal"))]
+    fn search(byte: u8, bytes: &[u8]) -> Option<usize> {
+        bytes.iter().position(|candidate| *candidate == byte)
+    }
+
+    search(needle, haystack)
+}
+
+/// Looks up the group named `reference` in each element of `caps`, in order, and returns the
+/// text of the first one that has a match; `None` when no element does.
+pub fn get_capture<'a>(caps: &'a [Captures], reference: &str) -> Option<&'a str> {
+    let first_hit = caps
+        .iter()
+        .find_map(|captures| captures.name(reference))?;
+    Some(first_hit.as_str())
+}
+
+/// Expands `replacement` via `expand_str_lambda`, resolving each reference with
+/// `get_capture` on `caps`. A reference that has no match expands to the empty string.
+pub fn expand_str_captures(caps: &[Captures], replacement: &str) -> String {
+    let mut expanded = String::new();
+    expand_str_lambda(
+        |reference: &str| match get_capture(caps, reference) {
+            Some(text) => text,
+            None => "",
+        },
+        replacement,
+        &mut expanded,
+    );
+    expanded
+}
+
+/// Convenience wrapper around `expand_str_lambda`: expands `replacement` into a freshly
+/// created `String` and returns it. `lambda` receives each reference name and supplies the
+/// text to insert for it.
+pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> String
+where
+    F: Fn(&str) -> &'a str,
+{
+    let mut expanded = String::new();
+    expand_str_lambda(lambda, replacement, &mut expanded);
+    expanded
+}
+
+/// Appends `replacement` to `dst`, substituting every capture reference on the way.
+///
+/// * `$$` is written as a single literal `$`.
+/// * `$name` and `${name}` are replaced by `cap(name)`.
+/// * A `$` that does not start a valid reference is copied through unchanged.
+/// * All other text is copied verbatim.
+pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String)
+where
+    F: Fn(&str) -> &'a str,
+{
+    let mut rest = replacement;
+    while let Some(dollar) = find_byte(b'$', rest.as_bytes()) {
+        // copy the literal text in front of the `$`, then continue from the `$` itself
+        dst.push_str(&rest[..dollar]);
+        rest = &rest[dollar..];
+
+        // `$$` is the escape for a literal dollar sign
+        if rest.as_bytes().get(1) == Some(&b'$') {
+            dst.push('$');
+            rest = &rest[2..];
+            continue;
+        }
+
+        match find_cap_ref(rest.as_bytes()) {
+            Some(CaptureRef { cap: name, end }) => {
+                rest = &rest[end..];
+                dst.push_str(cap(name));
+            }
+            None => {
+                // not a reference after all: emit the `$` as-is and move past it
+                dst.push('$');
+                rest = &rest[1..];
+            }
+        }
+    }
+    // whatever is left contains no `$` at all
+    dst.push_str(rest);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number; both are kept as text.
+///
+/// It is also tagged with the position in the text following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+    /// The referenced name, without the leading `$` and without any surrounding braces.
+    cap: &'a str,
+    /// Byte offset just past the reference, counted from the `$` that starts it.
+    end: usize,
+}
+
+/// Tries to parse a capture reference (`$name` or `${name}`) located at the very start of
+/// `replacement`.
+///
+/// Returns `None` when `replacement` is shorter than two bytes, does not begin with `$`, or
+/// the `$` is not followed by a valid name.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
+    // A reference needs the `$` plus at least one more byte.
+    if replacement.len() <= 1 || replacement[0] != b'$' {
+        return None;
+    }
+    if replacement[1] == b'{' {
+        // `${name}` form: the name starts right after the opening brace.
+        return find_cap_ref_braced(replacement, 2);
+    }
+    let name_len = replacement[1..]
+        .iter()
+        .take_while(|&byte| is_valid_cap_letter(byte))
+        .count();
+    if name_len == 0 {
+        return None;
+    }
+    let end = 1 + name_len;
+    // Every byte accepted by `is_valid_cap_letter` is ASCII, so the name is valid UTF-8 and
+    // this conversion cannot fail.
+    let cap = std::str::from_utf8(&replacement[1..end]).expect("valid UTF-8 capture name");
+    Some(CaptureRef { cap, end })
+}
+
+/// Parses the remainder of a `${name}` reference; `i` is the index of the first byte after
+/// the opening `{`.
+///
+/// Returns `None` if there is no closing `}` at or after `i`, or if the bytes in between are
+/// not valid UTF-8. No other restriction is placed on the name.
+fn find_cap_ref_braced(rep: &[u8], i: usize) -> Option<CaptureRef> {
+    let close = i + rep.get(i..)?.iter().position(|&byte| byte == b'}')?;
+    let cap = std::str::from_utf8(&rep[i..close]).ok()?;
+    Some(CaptureRef {
+        cap,
+        // one past the closing brace
+        end: close + 1,
+    })
+}
+
+/// Reports whether `b` may appear in an unbraced capture name: an ASCII letter, an ASCII
+/// digit or an underscore.
+fn is_valid_cap_letter(b: &u8) -> bool {
+    b.is_ascii_alphanumeric() || *b == b'_'
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -9,6 +9,7 @@ pub mod pipe;
 pub mod preproc;
 pub mod preproc_cache;
 pub mod recurse;
+pub mod expand;
 #[cfg(test)]
 pub mod test_utils;
 use anyhow::Context;