kinda implement postproc_encoding

This commit is contained in:
phiresky 2023-02-21 10:32:53 +01:00
parent a8386a2575
commit 9d9b34f9cc
2 changed files with 39 additions and 34 deletions

View File

@ -5,6 +5,9 @@
use anyhow::Result;
use async_stream::stream;
use bytes::Bytes;
use encoding_rs::Encoding;
use encoding_rs_io::DecodeReaderBytesBuilder;
use tokio_util::io::SyncIoBridge;
use std::io::Cursor;
use std::path::PathBuf;
@ -49,7 +52,7 @@ impl FileAdapter for PostprocPrefix {
) -> Result<AdaptedFilesIterBox> {
let read = add_newline(postproc_prefix(
&a.line_prefix,
postproc_encoding(&a.line_prefix, a.inp)?,
postproc_encoding(&a.line_prefix, a.inp).await?,
));
// keep adapt info (filename etc) except replace inp
let ai = AdaptInfo {
@ -74,50 +77,52 @@ impl Read for ReadErr {
* Detects and converts encodings other than utf-8 to utf-8.
* If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
*/
pub fn postproc_encoding(
async fn postproc_encoding(
_line_prefix: &str,
inp: impl AsyncRead + Send + 'static,
inp: Pin<Box<dyn AsyncRead + Send>>,
) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
Ok(Box::pin(inp))
// panic!("todo: implement");
/*// TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let inp = decode_builder
.encoding(encoding)
.utf8_passthru(true)
.strip_bom(bom_sniffing)
.bom_override(true)
.bom_sniffing(bom_sniffing)
.build(inp);
// check for binary content in first 8kB
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
let mut fourk = Vec::with_capacity(1 << 13);
let mut beginning = inp.take(1 << 13);
beginning.read_to_end(&mut fourk)?;
beginning.read_to_end(&mut fourk).await?;
if fourk.contains(&0u8) {
log::debug!("detected binary");
let v = "[rga: binary data]";
return Ok(Box::new(std::io::Cursor::new(v)));
/*let err = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("{}[rga: binary data]", line_prefix),
);
return Err(err).context("");
return ReadErr {
err,
};*/
return Ok(Box::pin(std::io::Cursor::new(v)));
}
let enc = Encoding::for_bom(&fourk);
let inp = std::io::Cursor::new(fourk).chain(beginning.into_inner());
match enc {
None => Ok(Box::pin(inp)),
Some((enc, _)) if enc == encoding_rs::UTF_8 => Ok(Box::pin(inp)),
Some(_) => {
// detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
// TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let mut inp = decode_builder
.encoding(encoding)
.utf8_passthru(true)
.strip_bom(bom_sniffing)
.bom_override(true)
.bom_sniffing(bom_sniffing)
.build(SyncIoBridge::new(inp));
let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
let mut oup = Vec::new();
std::io::Read::read_to_end(&mut inp, &mut oup)?;
Ok(oup)
})
.await??;
Ok(Box::pin(std::io::Cursor::new(oup)))
}
}
Ok(Box::new(
std::io::Cursor::new(fourk).chain(beginning.into_inner()),
))*/
}
/// Adds the given prefix to each line in an `AsyncRead`.

View File

@ -227,7 +227,7 @@ pub fn loop_adapt(
match buf_choose_adapter(file?).await? {
Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
yield Ok(AdaptInfo {
inp: Box::pin(Cursor::new(s)),
..ai