simplify pagebreaks and add one space

2024-11-24 12:24:56 +00:00 · 2022-12-26 19:28:56 +01:00 · 2022-12-26 19:28:56 +01:00 · 96ebcdef27
commit 96ebcdef27
parent 2d28651bcb
4 changed files with 48 additions and 59 deletions
--- a/src/adapters/custom.rs
+++ b/src/adapters/custom.rs
@ -315,13 +315,13 @@ mod test {
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
-            "PREFIX:Page 1:hello world
-PREFIX:Page 1:this is just a test.
-PREFIX:Page 1:
-PREFIX:Page 1:1
-PREFIX:Page 1:
-PREFIX:Page 1:
-PREFIX:Page 2:
+            "PREFIX:Page 1: hello world
+PREFIX:Page 1: this is just a test.
+PREFIX:Page 1: 
+PREFIX:Page 1: 1
+PREFIX:Page 1: 
+PREFIX:Page 1: 
+PREFIX:Page 2: 
 "
        );
        Ok(())
--- a/src/adapters/decompress.rs
+++ b/src/adapters/decompress.rs
@ -154,13 +154,13 @@ mod tests {
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
-            "PREFIX:Page 1:hello world
-PREFIX:Page 1:this is just a test.
-PREFIX:Page 1:
-PREFIX:Page 1:1
-PREFIX:Page 1:
-PREFIX:Page 1:
-PREFIX:Page 2:
+            "PREFIX:Page 1: hello world
+PREFIX:Page 1: this is just a test.
+PREFIX:Page 1: 
+PREFIX:Page 1: 1
+PREFIX:Page 1: 
+PREFIX:Page 1: 
+PREFIX:Page 2: 
 "
        );
        Ok(())
--- a/src/adapters/postproc.rs
+++ b/src/adapters/postproc.rs
@ -170,7 +170,7 @@ impl FileAdapter for PostprocPageBreaks {
        a: super::AdaptInfo,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
-        let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?);
+        let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
        // keep adapt info (filename etc) except replace inp
        let ai = AdaptInfo {
            inp: Box::pin(read),
@ -187,33 +187,30 @@ impl FileAdapter for PostprocPageBreaks {
        Ok(one_file(ai))
    }
 }
-/// Adds the prefix "Page N:" to each line,
+/// Adds the prefix "Page N: " to each line,
 /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
 /// ASCII form feeds are the page delimiters output by `pdftotext`.
-pub fn postproc_pagebreaks(
-    line_prefix: &str,
-    input: impl AsyncRead + Send,
-) -> impl AsyncRead + Send {
-    let line_prefix_o: String = line_prefix.into();
+pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send {
    let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
    let regex_newline = regex::bytes::Regex::new("\n").unwrap();
    let mut page_count: i32 = 1;
-    let mut page_prefix: String = format!("Page {page_count}:{line_prefix_o}");
+    let mut page_prefix: String = format!("Page {page_count}: ");

    let input_stream = ReaderStream::new(input);
    let output_stream = stream! {
-        for await chunk in input_stream {
-            match chunk {
+        for await read_chunk in input_stream {
+            match read_chunk {
                Err(e) => yield Err(e),
                Ok(chunk) => {
-                    let sub_chunks = regex_linefeed.split(&chunk);
-                    for sub_chunk in sub_chunks {
+                    let page_chunks = regex_linefeed.split(&chunk);
+                    for page_chunk in page_chunks {
                        // println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
                        yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
-                        page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
-                        yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(sub_chunk, page_prefix.as_bytes())));
+                        page_prefix = format!("\nPage {page_count}: ");
+
+                        yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(page_chunk, page_prefix.as_bytes())));
                        page_count += 1;
-                        page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
+                        page_prefix = format!("\nPage {page_count}: ");
                    }
                }
            }
@ -226,6 +223,7 @@ pub fn postproc_pagebreaks(
 mod tests {
    use super::*;
    use anyhow::Result;
+    use pretty_assertions::assert_eq;
    use tokio::pin;
    use tokio_test::io::Builder;
    use tokio_test::io::Mock;
@ -236,12 +234,12 @@ mod tests {
        let mock: Mock = Builder::new()
            .read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
            .build();
-        let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await;
+        let res = postproc_pagebreaks(mock).read_to_end(&mut output).await;
        println!("{}", String::from_utf8_lossy(&output));
        assert!(matches!(res, Ok(_)));
        assert_eq!(
-            output,
-            b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test"
+            String::from_utf8_lossy(&output),
+            "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test"
        );
    }

@ -275,23 +273,14 @@ mod tests {
        let mut oup = Vec::new();
        let inp = postproc_encoding("", a)?;
        if pagebreaks {
-            postproc_pagebreaks(line_prefix, inp)
-                .read_to_end(&mut oup)
-                .await?;
+            postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
        } else {
            let x = postproc_prefix(line_prefix, inp);
            pin!(x);
            x.read_to_end(&mut oup).await?;
        }
        let c = String::from_utf8_lossy(&oup);
-        if b != c {
-            anyhow::bail!(
-                "`{}`\nshould be\n`{}`\nbut is\n`{}`",
-                String::from_utf8_lossy(a),
-                b,
-                c
-            );
-        }
+        assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a));

        Ok(())
    }
@ -299,14 +288,14 @@ mod tests {
    #[tokio::test]
    async fn post1() -> Result<()> {
        let inp = "What is this\nThis is a test\nFoo";
-        let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo";
+        let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo";

        test_from_strs(true, "", inp, oup).await?;

        println!("\n\n\n\n");

        let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
-        let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
+        let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!";

        test_from_strs(true, "", inp, oup).await?;

--- a/src/adapters/tar.rs
+++ b/src/adapters/tar.rs
@ -109,20 +109,20 @@ mod tests {
        let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
        assert_eq!(
            String::from_utf8(o).context("parsing utf8")?,
-            "PREFIX:dir/file-b.pdf: Page 1:hello world
-PREFIX:dir/file-b.pdf: Page 1:this is just a test.
-PREFIX:dir/file-b.pdf: Page 1:
-PREFIX:dir/file-b.pdf: Page 1:1
-PREFIX:dir/file-b.pdf: Page 1:
-PREFIX:dir/file-b.pdf: Page 1:
-PREFIX:dir/file-b.pdf: Page 2:
-PREFIX:dir/file-a.pdf: Page 1:hello world
-PREFIX:dir/file-a.pdf: Page 1:this is just a test.
-PREFIX:dir/file-a.pdf: Page 1:
-PREFIX:dir/file-a.pdf: Page 1:1
-PREFIX:dir/file-a.pdf: Page 1:
-PREFIX:dir/file-a.pdf: Page 1:
-PREFIX:dir/file-a.pdf: Page 2:
+            "PREFIX:dir/file-b.pdf: Page 1: hello world
+PREFIX:dir/file-b.pdf: Page 1: this is just a test.
+PREFIX:dir/file-b.pdf: Page 1: 
+PREFIX:dir/file-b.pdf: Page 1: 1
+PREFIX:dir/file-b.pdf: Page 1: 
+PREFIX:dir/file-b.pdf: Page 1: 
+PREFIX:dir/file-b.pdf: Page 2: 
+PREFIX:dir/file-a.pdf: Page 1: hello world
+PREFIX:dir/file-a.pdf: Page 1: this is just a test.
+PREFIX:dir/file-a.pdf: Page 1: 
+PREFIX:dir/file-a.pdf: Page 1: 1
+PREFIX:dir/file-a.pdf: Page 1: 
+PREFIX:dir/file-a.pdf: Page 1: 
+PREFIX:dir/file-a.pdf: Page 2: 
 "
        );
        Ok(())