mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 12:24:56 +00:00
simplify pagebreaks and add one space
This commit is contained in:
parent
2d28651bcb
commit
96ebcdef27
@ -315,13 +315,13 @@ mod test {
|
|||||||
let o = adapted_to_vec(r).await?;
|
let o = adapted_to_vec(r).await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
String::from_utf8(o)?,
|
String::from_utf8(o)?,
|
||||||
"PREFIX:Page 1:hello world
|
"PREFIX:Page 1: hello world
|
||||||
PREFIX:Page 1:this is just a test.
|
PREFIX:Page 1: this is just a test.
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 1:1
|
PREFIX:Page 1: 1
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 2:
|
PREFIX:Page 2:
|
||||||
"
|
"
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -154,13 +154,13 @@ mod tests {
|
|||||||
let o = adapted_to_vec(r).await?;
|
let o = adapted_to_vec(r).await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
String::from_utf8(o)?,
|
String::from_utf8(o)?,
|
||||||
"PREFIX:Page 1:hello world
|
"PREFIX:Page 1: hello world
|
||||||
PREFIX:Page 1:this is just a test.
|
PREFIX:Page 1: this is just a test.
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 1:1
|
PREFIX:Page 1: 1
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 1:
|
PREFIX:Page 1:
|
||||||
PREFIX:Page 2:
|
PREFIX:Page 2:
|
||||||
"
|
"
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -170,7 +170,7 @@ impl FileAdapter for PostprocPageBreaks {
|
|||||||
a: super::AdaptInfo,
|
a: super::AdaptInfo,
|
||||||
_detection_reason: &crate::matching::FileMatcher,
|
_detection_reason: &crate::matching::FileMatcher,
|
||||||
) -> Result<AdaptedFilesIterBox> {
|
) -> Result<AdaptedFilesIterBox> {
|
||||||
let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?);
|
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
|
||||||
// keep adapt info (filename etc) except replace inp
|
// keep adapt info (filename etc) except replace inp
|
||||||
let ai = AdaptInfo {
|
let ai = AdaptInfo {
|
||||||
inp: Box::pin(read),
|
inp: Box::pin(read),
|
||||||
@ -187,33 +187,30 @@ impl FileAdapter for PostprocPageBreaks {
|
|||||||
Ok(one_file(ai))
|
Ok(one_file(ai))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// Adds the prefix "Page N:" to each line,
|
/// Adds the prefix "Page N: " to each line,
|
||||||
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
|
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
|
||||||
/// ASCII form feeds are the page delimiters output by `pdftotext`.
|
/// ASCII form feeds are the page delimiters output by `pdftotext`.
|
||||||
pub fn postproc_pagebreaks(
|
pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send {
|
||||||
line_prefix: &str,
|
|
||||||
input: impl AsyncRead + Send,
|
|
||||||
) -> impl AsyncRead + Send {
|
|
||||||
let line_prefix_o: String = line_prefix.into();
|
|
||||||
let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
|
let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
|
||||||
let regex_newline = regex::bytes::Regex::new("\n").unwrap();
|
let regex_newline = regex::bytes::Regex::new("\n").unwrap();
|
||||||
let mut page_count: i32 = 1;
|
let mut page_count: i32 = 1;
|
||||||
let mut page_prefix: String = format!("Page {page_count}:{line_prefix_o}");
|
let mut page_prefix: String = format!("Page {page_count}: ");
|
||||||
|
|
||||||
let input_stream = ReaderStream::new(input);
|
let input_stream = ReaderStream::new(input);
|
||||||
let output_stream = stream! {
|
let output_stream = stream! {
|
||||||
for await chunk in input_stream {
|
for await read_chunk in input_stream {
|
||||||
match chunk {
|
match read_chunk {
|
||||||
Err(e) => yield Err(e),
|
Err(e) => yield Err(e),
|
||||||
Ok(chunk) => {
|
Ok(chunk) => {
|
||||||
let sub_chunks = regex_linefeed.split(&chunk);
|
let page_chunks = regex_linefeed.split(&chunk);
|
||||||
for sub_chunk in sub_chunks {
|
for page_chunk in page_chunks {
|
||||||
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
|
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
|
||||||
yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
|
yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
|
||||||
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
|
page_prefix = format!("\nPage {page_count}: ");
|
||||||
yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(sub_chunk, page_prefix.as_bytes())));
|
|
||||||
|
yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(page_chunk, page_prefix.as_bytes())));
|
||||||
page_count += 1;
|
page_count += 1;
|
||||||
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
|
page_prefix = format!("\nPage {page_count}: ");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -226,6 +223,7 @@ pub fn postproc_pagebreaks(
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use pretty_assertions::assert_eq;
|
||||||
use tokio::pin;
|
use tokio::pin;
|
||||||
use tokio_test::io::Builder;
|
use tokio_test::io::Builder;
|
||||||
use tokio_test::io::Mock;
|
use tokio_test::io::Mock;
|
||||||
@ -236,12 +234,12 @@ mod tests {
|
|||||||
let mock: Mock = Builder::new()
|
let mock: Mock = Builder::new()
|
||||||
.read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
|
.read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
|
||||||
.build();
|
.build();
|
||||||
let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await;
|
let res = postproc_pagebreaks(mock).read_to_end(&mut output).await;
|
||||||
println!("{}", String::from_utf8_lossy(&output));
|
println!("{}", String::from_utf8_lossy(&output));
|
||||||
assert!(matches!(res, Ok(_)));
|
assert!(matches!(res, Ok(_)));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
output,
|
String::from_utf8_lossy(&output),
|
||||||
b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test"
|
"Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -275,23 +273,14 @@ mod tests {
|
|||||||
let mut oup = Vec::new();
|
let mut oup = Vec::new();
|
||||||
let inp = postproc_encoding("", a)?;
|
let inp = postproc_encoding("", a)?;
|
||||||
if pagebreaks {
|
if pagebreaks {
|
||||||
postproc_pagebreaks(line_prefix, inp)
|
postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
|
||||||
.read_to_end(&mut oup)
|
|
||||||
.await?;
|
|
||||||
} else {
|
} else {
|
||||||
let x = postproc_prefix(line_prefix, inp);
|
let x = postproc_prefix(line_prefix, inp);
|
||||||
pin!(x);
|
pin!(x);
|
||||||
x.read_to_end(&mut oup).await?;
|
x.read_to_end(&mut oup).await?;
|
||||||
}
|
}
|
||||||
let c = String::from_utf8_lossy(&oup);
|
let c = String::from_utf8_lossy(&oup);
|
||||||
if b != c {
|
assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a));
|
||||||
anyhow::bail!(
|
|
||||||
"`{}`\nshould be\n`{}`\nbut is\n`{}`",
|
|
||||||
String::from_utf8_lossy(a),
|
|
||||||
b,
|
|
||||||
c
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -299,14 +288,14 @@ mod tests {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn post1() -> Result<()> {
|
async fn post1() -> Result<()> {
|
||||||
let inp = "What is this\nThis is a test\nFoo";
|
let inp = "What is this\nThis is a test\nFoo";
|
||||||
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo";
|
let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo";
|
||||||
|
|
||||||
test_from_strs(true, "", inp, oup).await?;
|
test_from_strs(true, "", inp, oup).await?;
|
||||||
|
|
||||||
println!("\n\n\n\n");
|
println!("\n\n\n\n");
|
||||||
|
|
||||||
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
|
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
|
||||||
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
|
let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!";
|
||||||
|
|
||||||
test_from_strs(true, "", inp, oup).await?;
|
test_from_strs(true, "", inp, oup).await?;
|
||||||
|
|
||||||
|
@ -109,20 +109,20 @@ mod tests {
|
|||||||
let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
|
let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
String::from_utf8(o).context("parsing utf8")?,
|
String::from_utf8(o).context("parsing utf8")?,
|
||||||
"PREFIX:dir/file-b.pdf: Page 1:hello world
|
"PREFIX:dir/file-b.pdf: Page 1: hello world
|
||||||
PREFIX:dir/file-b.pdf: Page 1:this is just a test.
|
PREFIX:dir/file-b.pdf: Page 1: this is just a test.
|
||||||
PREFIX:dir/file-b.pdf: Page 1:
|
PREFIX:dir/file-b.pdf: Page 1:
|
||||||
PREFIX:dir/file-b.pdf: Page 1:1
|
PREFIX:dir/file-b.pdf: Page 1: 1
|
||||||
PREFIX:dir/file-b.pdf: Page 1:
|
PREFIX:dir/file-b.pdf: Page 1:
|
||||||
PREFIX:dir/file-b.pdf: Page 1:
|
PREFIX:dir/file-b.pdf: Page 1:
|
||||||
PREFIX:dir/file-b.pdf: Page 2:
|
PREFIX:dir/file-b.pdf: Page 2:
|
||||||
PREFIX:dir/file-a.pdf: Page 1:hello world
|
PREFIX:dir/file-a.pdf: Page 1: hello world
|
||||||
PREFIX:dir/file-a.pdf: Page 1:this is just a test.
|
PREFIX:dir/file-a.pdf: Page 1: this is just a test.
|
||||||
PREFIX:dir/file-a.pdf: Page 1:
|
PREFIX:dir/file-a.pdf: Page 1:
|
||||||
PREFIX:dir/file-a.pdf: Page 1:1
|
PREFIX:dir/file-a.pdf: Page 1: 1
|
||||||
PREFIX:dir/file-a.pdf: Page 1:
|
PREFIX:dir/file-a.pdf: Page 1:
|
||||||
PREFIX:dir/file-a.pdf: Page 1:
|
PREFIX:dir/file-a.pdf: Page 1:
|
||||||
PREFIX:dir/file-a.pdf: Page 2:
|
PREFIX:dir/file-a.pdf: Page 2:
|
||||||
"
|
"
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
|
Loading…
Reference in New Issue
Block a user