From 96ebcdef2720b0604eae8b2f2e79a8d5ac15a3be Mon Sep 17 00:00:00 2001 From: phiresky Date: Mon, 26 Dec 2022 19:28:56 +0100 Subject: [PATCH] simplify pagebreaks and add one space --- src/adapters/custom.rs | 14 +++++------ src/adapters/decompress.rs | 14 +++++------ src/adapters/postproc.rs | 51 +++++++++++++++----------------------- src/adapters/tar.rs | 28 ++++++++++----------- 4 files changed, 48 insertions(+), 59 deletions(-) diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs index 73dfc98..22c4fa4 100644 --- a/src/adapters/custom.rs +++ b/src/adapters/custom.rs @@ -315,13 +315,13 @@ mod test { let o = adapted_to_vec(r).await?; assert_eq!( String::from_utf8(o)?, - "PREFIX:Page 1:hello world -PREFIX:Page 1:this is just a test. -PREFIX:Page 1: -PREFIX:Page 1:1 -PREFIX:Page 1: -PREFIX:Page 1: -PREFIX:Page 2: + "PREFIX:Page 1: hello world +PREFIX:Page 1: this is just a test. +PREFIX:Page 1: +PREFIX:Page 1: 1 +PREFIX:Page 1: +PREFIX:Page 1: +PREFIX:Page 2: " ); Ok(()) diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index 94cab7c..f7e7acf 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -154,13 +154,13 @@ mod tests { let o = adapted_to_vec(r).await?; assert_eq!( String::from_utf8(o)?, - "PREFIX:Page 1:hello world -PREFIX:Page 1:this is just a test. -PREFIX:Page 1: -PREFIX:Page 1:1 -PREFIX:Page 1: -PREFIX:Page 1: -PREFIX:Page 2: + "PREFIX:Page 1: hello world +PREFIX:Page 1: this is just a test. +PREFIX:Page 1: +PREFIX:Page 1: 1 +PREFIX:Page 1: +PREFIX:Page 1: +PREFIX:Page 2: " ); Ok(()) diff --git a/src/adapters/postproc.rs b/src/adapters/postproc.rs index 9fd6841..40ff205 100644 --- a/src/adapters/postproc.rs +++ b/src/adapters/postproc.rs @@ -170,7 +170,7 @@ impl FileAdapter for PostprocPageBreaks { a: super::AdaptInfo, _detection_reason: &crate::matching::FileMatcher, ) -> Result { - let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?); + let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?); // keep adapt info (filename etc) except replace inp let ai = AdaptInfo { inp: Box::pin(read), @@ -187,33 +187,30 @@ impl FileAdapter for PostprocPageBreaks { Ok(one_file(ai)) } } -/// Adds the prefix "Page N:" to each line, +/// Adds the prefix "Page N: " to each line, /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream. /// ASCII form feeds are the page delimiters output by `pdftotext`. -pub fn postproc_pagebreaks( - line_prefix: &str, - input: impl AsyncRead + Send, -) -> impl AsyncRead + Send { - let line_prefix_o: String = line_prefix.into(); +pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send { let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap(); let regex_newline = regex::bytes::Regex::new("\n").unwrap(); let mut page_count: i32 = 1; - let mut page_prefix: String = format!("Page {page_count}:{line_prefix_o}"); + let mut page_prefix: String = format!("Page {page_count}: "); let input_stream = ReaderStream::new(input); let output_stream = stream! { - for await chunk in input_stream { - match chunk { + for await read_chunk in input_stream { + match read_chunk { Err(e) => yield Err(e), Ok(chunk) => { - let sub_chunks = regex_linefeed.split(&chunk); - for sub_chunk in sub_chunks { + let page_chunks = regex_linefeed.split(&chunk); + for page_chunk in page_chunks { // println!("{}", String::from_utf8_lossy(page_prefix.as_bytes())); yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes())); - page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o); - yield Ok(Bytes::copy_from_slice(®ex_newline.replace_all(sub_chunk, page_prefix.as_bytes()))); + page_prefix = format!("\nPage {page_count}: "); + + yield Ok(Bytes::copy_from_slice(®ex_newline.replace_all(page_chunk, page_prefix.as_bytes()))); page_count += 1; - page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o); + page_prefix = format!("\nPage {page_count}: "); } } } @@ -226,6 +223,7 @@ pub fn postproc_pagebreaks( mod tests { use super::*; use anyhow::Result; + use pretty_assertions::assert_eq; use tokio::pin; use tokio_test::io::Builder; use tokio_test::io::Mock; @@ -236,12 +234,12 @@ mod tests { let mock: Mock = Builder::new() .read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest") .build(); - let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await; + let res = postproc_pagebreaks(mock).read_to_end(&mut output).await; println!("{}", String::from_utf8_lossy(&output)); assert!(matches!(res, Ok(_))); assert_eq!( - output, - b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test" + String::from_utf8_lossy(&output), + "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test" ); } @@ -275,23 +273,14 @@ mod tests { let mut oup = Vec::new(); let inp = postproc_encoding("", a)?; if pagebreaks { - postproc_pagebreaks(line_prefix, inp) - .read_to_end(&mut oup) - .await?; + postproc_pagebreaks(inp).read_to_end(&mut oup).await?; } else { let x = postproc_prefix(line_prefix, inp); pin!(x); x.read_to_end(&mut oup).await?; } let c = String::from_utf8_lossy(&oup); - if b != c { - anyhow::bail!( - "`{}`\nshould be\n`{}`\nbut is\n`{}`", - String::from_utf8_lossy(a), - b, - c - ); - } + assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a)); Ok(()) } @@ -299,14 +288,14 @@ mod tests { #[tokio::test] async fn post1() -> Result<()> { let inp = "What is this\nThis is a test\nFoo"; - let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo"; + let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo"; test_from_strs(true, "", inp, oup).await?; println!("\n\n\n\n"); let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; - let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!"; + let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!"; test_from_strs(true, "", inp, oup).await?; diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index c99547d..e2aaaf0 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -109,20 +109,20 @@ mod tests { let o = adapted_to_vec(r).await.context("adapted_to_vec")?; assert_eq!( String::from_utf8(o).context("parsing utf8")?, - "PREFIX:dir/file-b.pdf: Page 1:hello world -PREFIX:dir/file-b.pdf: Page 1:this is just a test. -PREFIX:dir/file-b.pdf: Page 1: -PREFIX:dir/file-b.pdf: Page 1:1 -PREFIX:dir/file-b.pdf: Page 1: -PREFIX:dir/file-b.pdf: Page 1: -PREFIX:dir/file-b.pdf: Page 2: -PREFIX:dir/file-a.pdf: Page 1:hello world -PREFIX:dir/file-a.pdf: Page 1:this is just a test. -PREFIX:dir/file-a.pdf: Page 1: -PREFIX:dir/file-a.pdf: Page 1:1 -PREFIX:dir/file-a.pdf: Page 1: -PREFIX:dir/file-a.pdf: Page 1: -PREFIX:dir/file-a.pdf: Page 2: + "PREFIX:dir/file-b.pdf: Page 1: hello world +PREFIX:dir/file-b.pdf: Page 1: this is just a test. +PREFIX:dir/file-b.pdf: Page 1: +PREFIX:dir/file-b.pdf: Page 1: 1 +PREFIX:dir/file-b.pdf: Page 1: +PREFIX:dir/file-b.pdf: Page 1: +PREFIX:dir/file-b.pdf: Page 2: +PREFIX:dir/file-a.pdf: Page 1: hello world +PREFIX:dir/file-a.pdf: Page 1: this is just a test. +PREFIX:dir/file-a.pdf: Page 1: +PREFIX:dir/file-a.pdf: Page 1: 1 +PREFIX:dir/file-a.pdf: Page 1: +PREFIX:dir/file-a.pdf: Page 1: +PREFIX:dir/file-a.pdf: Page 2: " ); Ok(())