mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-10 06:50:38 +00:00
simplify pagebreaks and add one space
This commit is contained in:
parent
2d28651bcb
commit
96ebcdef27
@ -315,13 +315,13 @@ mod test {
|
||||
let o = adapted_to_vec(r).await?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o)?,
|
||||
"PREFIX:Page 1:hello world
|
||||
PREFIX:Page 1:this is just a test.
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:1
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 2:
|
||||
"PREFIX:Page 1: hello world
|
||||
PREFIX:Page 1: this is just a test.
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1: 1
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 2:
|
||||
"
|
||||
);
|
||||
Ok(())
|
||||
|
@ -154,13 +154,13 @@ mod tests {
|
||||
let o = adapted_to_vec(r).await?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o)?,
|
||||
"PREFIX:Page 1:hello world
|
||||
PREFIX:Page 1:this is just a test.
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:1
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 2:
|
||||
"PREFIX:Page 1: hello world
|
||||
PREFIX:Page 1: this is just a test.
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1: 1
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 1:
|
||||
PREFIX:Page 2:
|
||||
"
|
||||
);
|
||||
Ok(())
|
||||
|
@ -170,7 +170,7 @@ impl FileAdapter for PostprocPageBreaks {
|
||||
a: super::AdaptInfo,
|
||||
_detection_reason: &crate::matching::FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?);
|
||||
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
|
||||
// keep adapt info (filename etc) except replace inp
|
||||
let ai = AdaptInfo {
|
||||
inp: Box::pin(read),
|
||||
@ -187,33 +187,30 @@ impl FileAdapter for PostprocPageBreaks {
|
||||
Ok(one_file(ai))
|
||||
}
|
||||
}
|
||||
/// Adds the prefix "Page N:" to each line,
|
||||
/// Adds the prefix "Page N: " to each line,
|
||||
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
|
||||
/// ASCII form feeds are the page delimiters output by `pdftotext`.
|
||||
pub fn postproc_pagebreaks(
|
||||
line_prefix: &str,
|
||||
input: impl AsyncRead + Send,
|
||||
) -> impl AsyncRead + Send {
|
||||
let line_prefix_o: String = line_prefix.into();
|
||||
pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send {
|
||||
let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
|
||||
let regex_newline = regex::bytes::Regex::new("\n").unwrap();
|
||||
let mut page_count: i32 = 1;
|
||||
let mut page_prefix: String = format!("Page {page_count}:{line_prefix_o}");
|
||||
let mut page_prefix: String = format!("Page {page_count}: ");
|
||||
|
||||
let input_stream = ReaderStream::new(input);
|
||||
let output_stream = stream! {
|
||||
for await chunk in input_stream {
|
||||
match chunk {
|
||||
for await read_chunk in input_stream {
|
||||
match read_chunk {
|
||||
Err(e) => yield Err(e),
|
||||
Ok(chunk) => {
|
||||
let sub_chunks = regex_linefeed.split(&chunk);
|
||||
for sub_chunk in sub_chunks {
|
||||
let page_chunks = regex_linefeed.split(&chunk);
|
||||
for page_chunk in page_chunks {
|
||||
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
|
||||
yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
|
||||
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
|
||||
yield Ok(Bytes::copy_from_slice(®ex_newline.replace_all(sub_chunk, page_prefix.as_bytes())));
|
||||
page_prefix = format!("\nPage {page_count}: ");
|
||||
|
||||
yield Ok(Bytes::copy_from_slice(®ex_newline.replace_all(page_chunk, page_prefix.as_bytes())));
|
||||
page_count += 1;
|
||||
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
|
||||
page_prefix = format!("\nPage {page_count}: ");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -226,6 +223,7 @@ pub fn postproc_pagebreaks(
|
||||
mod tests {
|
||||
use super::*;
|
||||
use anyhow::Result;
|
||||
use pretty_assertions::assert_eq;
|
||||
use tokio::pin;
|
||||
use tokio_test::io::Builder;
|
||||
use tokio_test::io::Mock;
|
||||
@ -236,12 +234,12 @@ mod tests {
|
||||
let mock: Mock = Builder::new()
|
||||
.read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
|
||||
.build();
|
||||
let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await;
|
||||
let res = postproc_pagebreaks(mock).read_to_end(&mut output).await;
|
||||
println!("{}", String::from_utf8_lossy(&output));
|
||||
assert!(matches!(res, Ok(_)));
|
||||
assert_eq!(
|
||||
output,
|
||||
b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test"
|
||||
String::from_utf8_lossy(&output),
|
||||
"Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test"
|
||||
);
|
||||
}
|
||||
|
||||
@ -275,23 +273,14 @@ mod tests {
|
||||
let mut oup = Vec::new();
|
||||
let inp = postproc_encoding("", a)?;
|
||||
if pagebreaks {
|
||||
postproc_pagebreaks(line_prefix, inp)
|
||||
.read_to_end(&mut oup)
|
||||
.await?;
|
||||
postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
|
||||
} else {
|
||||
let x = postproc_prefix(line_prefix, inp);
|
||||
pin!(x);
|
||||
x.read_to_end(&mut oup).await?;
|
||||
}
|
||||
let c = String::from_utf8_lossy(&oup);
|
||||
if b != c {
|
||||
anyhow::bail!(
|
||||
"`{}`\nshould be\n`{}`\nbut is\n`{}`",
|
||||
String::from_utf8_lossy(a),
|
||||
b,
|
||||
c
|
||||
);
|
||||
}
|
||||
assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -299,14 +288,14 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn post1() -> Result<()> {
|
||||
let inp = "What is this\nThis is a test\nFoo";
|
||||
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo";
|
||||
let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo";
|
||||
|
||||
test_from_strs(true, "", inp, oup).await?;
|
||||
|
||||
println!("\n\n\n\n");
|
||||
|
||||
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
|
||||
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
|
||||
let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!";
|
||||
|
||||
test_from_strs(true, "", inp, oup).await?;
|
||||
|
||||
|
@ -109,20 +109,20 @@ mod tests {
|
||||
let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o).context("parsing utf8")?,
|
||||
"PREFIX:dir/file-b.pdf: Page 1:hello world
|
||||
PREFIX:dir/file-b.pdf: Page 1:this is just a test.
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 1:1
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 2:
|
||||
PREFIX:dir/file-a.pdf: Page 1:hello world
|
||||
PREFIX:dir/file-a.pdf: Page 1:this is just a test.
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 1:1
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 2:
|
||||
"PREFIX:dir/file-b.pdf: Page 1: hello world
|
||||
PREFIX:dir/file-b.pdf: Page 1: this is just a test.
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 1: 1
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 1:
|
||||
PREFIX:dir/file-b.pdf: Page 2:
|
||||
PREFIX:dir/file-a.pdf: Page 1: hello world
|
||||
PREFIX:dir/file-a.pdf: Page 1: this is just a test.
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 1: 1
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 1:
|
||||
PREFIX:dir/file-a.pdf: Page 2:
|
||||
"
|
||||
);
|
||||
Ok(())
|
||||
|
Loading…
Reference in New Issue
Block a user