kinda implement postproc_encoding

This commit is contained in:
phiresky 2023-02-21 10:32:53 +01:00
parent a8386a2575
commit 9d9b34f9cc
2 changed files with 39 additions and 34 deletions

View File

@ -5,6 +5,9 @@
use anyhow::Result; use anyhow::Result;
use async_stream::stream; use async_stream::stream;
use bytes::Bytes; use bytes::Bytes;
use encoding_rs::Encoding;
use encoding_rs_io::DecodeReaderBytesBuilder;
use tokio_util::io::SyncIoBridge;
use std::io::Cursor; use std::io::Cursor;
use std::path::PathBuf; use std::path::PathBuf;
@ -49,7 +52,7 @@ impl FileAdapter for PostprocPrefix {
) -> Result<AdaptedFilesIterBox> { ) -> Result<AdaptedFilesIterBox> {
let read = add_newline(postproc_prefix( let read = add_newline(postproc_prefix(
&a.line_prefix, &a.line_prefix,
postproc_encoding(&a.line_prefix, a.inp)?, postproc_encoding(&a.line_prefix, a.inp).await?,
)); ));
// keep adapt info (filename etc) except replace inp // keep adapt info (filename etc) except replace inp
let ai = AdaptInfo { let ai = AdaptInfo {
@ -74,50 +77,52 @@ impl Read for ReadErr {
* Detects and converts encodings other than utf-8 to utf-8. * Detects and converts encodings other than utf-8 to utf-8.
* If the input stream does not contain valid text, returns the string `[rga: binary data]` instead * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
*/ */
pub fn postproc_encoding( async fn postproc_encoding(
_line_prefix: &str, _line_prefix: &str,
inp: impl AsyncRead + Send + 'static, inp: Pin<Box<dyn AsyncRead + Send>>,
) -> Result<Pin<Box<dyn AsyncRead + Send>>> { ) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
Ok(Box::pin(inp))
// panic!("todo: implement");
/*// TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let inp = decode_builder
.encoding(encoding)
.utf8_passthru(true)
.strip_bom(bom_sniffing)
.bom_override(true)
.bom_sniffing(bom_sniffing)
.build(inp);
// check for binary content in first 8kB // check for binary content in first 8kB
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
let mut fourk = Vec::with_capacity(1 << 13); let mut fourk = Vec::with_capacity(1 << 13);
let mut beginning = inp.take(1 << 13); let mut beginning = inp.take(1 << 13);
beginning.read_to_end(&mut fourk)?; beginning.read_to_end(&mut fourk).await?;
if fourk.contains(&0u8) { if fourk.contains(&0u8) {
log::debug!("detected binary"); log::debug!("detected binary");
let v = "[rga: binary data]"; let v = "[rga: binary data]";
return Ok(Box::new(std::io::Cursor::new(v))); return Ok(Box::pin(std::io::Cursor::new(v)));
/*let err = std::io::Error::new( }
std::io::ErrorKind::InvalidData, let enc = Encoding::for_bom(&fourk);
format!("{}[rga: binary data]", line_prefix), let inp = std::io::Cursor::new(fourk).chain(beginning.into_inner());
); match enc {
return Err(err).context(""); None => Ok(Box::pin(inp)),
return ReadErr { Some((enc, _)) if enc == encoding_rs::UTF_8 => Ok(Box::pin(inp)),
err, Some(_) => {
};*/ // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
// TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let mut inp = decode_builder
.encoding(encoding)
.utf8_passthru(true)
.strip_bom(bom_sniffing)
.bom_override(true)
.bom_sniffing(bom_sniffing)
.build(SyncIoBridge::new(inp));
let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
let mut oup = Vec::new();
std::io::Read::read_to_end(&mut inp, &mut oup)?;
Ok(oup)
})
.await??;
Ok(Box::pin(std::io::Cursor::new(oup)))
}
} }
Ok(Box::new(
std::io::Cursor::new(fourk).chain(beginning.into_inner()),
))*/
} }
/// Adds the given prefix to each line in an `AsyncRead`. /// Adds the given prefix to each line in an `AsyncRead`.

View File

@ -227,7 +227,7 @@ pub fn loop_adapt(
match buf_choose_adapter(file?).await? { match buf_choose_adapter(file?).await? {
Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => { Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 { if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes(); let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
yield Ok(AdaptInfo { yield Ok(AdaptInfo {
inp: Box::pin(Cursor::new(s)), inp: Box::pin(Cursor::new(s)),
..ai ..ai