simplify pagebreaks and add one space

This commit is contained in:
phiresky 2022-12-26 19:28:56 +01:00
parent 2d28651bcb
commit 96ebcdef27
4 changed files with 48 additions and 59 deletions

View File

@ -315,10 +315,10 @@ mod test {
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,
"PREFIX:Page 1:hello world "PREFIX:Page 1: hello world
PREFIX:Page 1:this is just a test. PREFIX:Page 1: this is just a test.
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 1:1 PREFIX:Page 1: 1
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 2: PREFIX:Page 2:

View File

@ -154,10 +154,10 @@ mod tests {
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,
"PREFIX:Page 1:hello world "PREFIX:Page 1: hello world
PREFIX:Page 1:this is just a test. PREFIX:Page 1: this is just a test.
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 1:1 PREFIX:Page 1: 1
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 1: PREFIX:Page 1:
PREFIX:Page 2: PREFIX:Page 2:

View File

@ -170,7 +170,7 @@ impl FileAdapter for PostprocPageBreaks {
a: super::AdaptInfo, a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher, _detection_reason: &crate::matching::FileMatcher,
) -> Result<AdaptedFilesIterBox> { ) -> Result<AdaptedFilesIterBox> {
let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?); let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
// keep adapt info (filename etc) except replace inp // keep adapt info (filename etc) except replace inp
let ai = AdaptInfo { let ai = AdaptInfo {
inp: Box::pin(read), inp: Box::pin(read),
@ -187,33 +187,30 @@ impl FileAdapter for PostprocPageBreaks {
Ok(one_file(ai)) Ok(one_file(ai))
} }
} }
/// Adds the prefix "Page N:" to each line, /// Adds the prefix "Page N: " to each line,
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream. /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
/// ASCII form feeds are the page delimiters output by `pdftotext`. /// ASCII form feeds are the page delimiters output by `pdftotext`.
pub fn postproc_pagebreaks( pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send {
line_prefix: &str,
input: impl AsyncRead + Send,
) -> impl AsyncRead + Send {
let line_prefix_o: String = line_prefix.into();
let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap(); let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
let regex_newline = regex::bytes::Regex::new("\n").unwrap(); let regex_newline = regex::bytes::Regex::new("\n").unwrap();
let mut page_count: i32 = 1; let mut page_count: i32 = 1;
let mut page_prefix: String = format!("Page {page_count}:{line_prefix_o}"); let mut page_prefix: String = format!("Page {page_count}: ");
let input_stream = ReaderStream::new(input); let input_stream = ReaderStream::new(input);
let output_stream = stream! { let output_stream = stream! {
for await chunk in input_stream { for await read_chunk in input_stream {
match chunk { match read_chunk {
Err(e) => yield Err(e), Err(e) => yield Err(e),
Ok(chunk) => { Ok(chunk) => {
let sub_chunks = regex_linefeed.split(&chunk); let page_chunks = regex_linefeed.split(&chunk);
for sub_chunk in sub_chunks { for page_chunk in page_chunks {
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes())); // println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes())); yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o); page_prefix = format!("\nPage {page_count}: ");
yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(sub_chunk, page_prefix.as_bytes())));
yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(page_chunk, page_prefix.as_bytes())));
page_count += 1; page_count += 1;
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o); page_prefix = format!("\nPage {page_count}: ");
} }
} }
} }
@ -226,6 +223,7 @@ pub fn postproc_pagebreaks(
mod tests { mod tests {
use super::*; use super::*;
use anyhow::Result; use anyhow::Result;
use pretty_assertions::assert_eq;
use tokio::pin; use tokio::pin;
use tokio_test::io::Builder; use tokio_test::io::Builder;
use tokio_test::io::Mock; use tokio_test::io::Mock;
@ -236,12 +234,12 @@ mod tests {
let mock: Mock = Builder::new() let mock: Mock = Builder::new()
.read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest") .read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
.build(); .build();
let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await; let res = postproc_pagebreaks(mock).read_to_end(&mut output).await;
println!("{}", String::from_utf8_lossy(&output)); println!("{}", String::from_utf8_lossy(&output));
assert!(matches!(res, Ok(_))); assert!(matches!(res, Ok(_)));
assert_eq!( assert_eq!(
output, String::from_utf8_lossy(&output),
b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test" "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test"
); );
} }
@ -275,23 +273,14 @@ mod tests {
let mut oup = Vec::new(); let mut oup = Vec::new();
let inp = postproc_encoding("", a)?; let inp = postproc_encoding("", a)?;
if pagebreaks { if pagebreaks {
postproc_pagebreaks(line_prefix, inp) postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
.read_to_end(&mut oup)
.await?;
} else { } else {
let x = postproc_prefix(line_prefix, inp); let x = postproc_prefix(line_prefix, inp);
pin!(x); pin!(x);
x.read_to_end(&mut oup).await?; x.read_to_end(&mut oup).await?;
} }
let c = String::from_utf8_lossy(&oup); let c = String::from_utf8_lossy(&oup);
if b != c { assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a));
anyhow::bail!(
"`{}`\nshould be\n`{}`\nbut is\n`{}`",
String::from_utf8_lossy(a),
b,
c
);
}
Ok(()) Ok(())
} }
@ -299,14 +288,14 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn post1() -> Result<()> { async fn post1() -> Result<()> {
let inp = "What is this\nThis is a test\nFoo"; let inp = "What is this\nThis is a test\nFoo";
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo"; let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo";
test_from_strs(true, "", inp, oup).await?; test_from_strs(true, "", inp, oup).await?;
println!("\n\n\n\n"); println!("\n\n\n\n");
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!"; let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!";
test_from_strs(true, "", inp, oup).await?; test_from_strs(true, "", inp, oup).await?;

View File

@ -109,17 +109,17 @@ mod tests {
let o = adapted_to_vec(r).await.context("adapted_to_vec")?; let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
assert_eq!( assert_eq!(
String::from_utf8(o).context("parsing utf8")?, String::from_utf8(o).context("parsing utf8")?,
"PREFIX:dir/file-b.pdf: Page 1:hello world "PREFIX:dir/file-b.pdf: Page 1: hello world
PREFIX:dir/file-b.pdf: Page 1:this is just a test. PREFIX:dir/file-b.pdf: Page 1: this is just a test.
PREFIX:dir/file-b.pdf: Page 1: PREFIX:dir/file-b.pdf: Page 1:
PREFIX:dir/file-b.pdf: Page 1:1 PREFIX:dir/file-b.pdf: Page 1: 1
PREFIX:dir/file-b.pdf: Page 1: PREFIX:dir/file-b.pdf: Page 1:
PREFIX:dir/file-b.pdf: Page 1: PREFIX:dir/file-b.pdf: Page 1:
PREFIX:dir/file-b.pdf: Page 2: PREFIX:dir/file-b.pdf: Page 2:
PREFIX:dir/file-a.pdf: Page 1:hello world PREFIX:dir/file-a.pdf: Page 1: hello world
PREFIX:dir/file-a.pdf: Page 1:this is just a test. PREFIX:dir/file-a.pdf: Page 1: this is just a test.
PREFIX:dir/file-a.pdf: Page 1: PREFIX:dir/file-a.pdf: Page 1:
PREFIX:dir/file-a.pdf: Page 1:1 PREFIX:dir/file-a.pdf: Page 1: 1
PREFIX:dir/file-a.pdf: Page 1: PREFIX:dir/file-a.pdf: Page 1:
PREFIX:dir/file-a.pdf: Page 1: PREFIX:dir/file-a.pdf: Page 1:
PREFIX:dir/file-a.pdf: Page 2: PREFIX:dir/file-a.pdf: Page 2: