binary detection and utf16

This commit is contained in:
phiresky 2023-02-21 11:43:17 +01:00
parent 9d9b34f9cc
commit b8cca54770
9 changed files with 111 additions and 53 deletions

View File

@ -9,6 +9,7 @@ pub mod writing;
pub mod zip; pub mod zip;
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
use anyhow::{format_err, Context, Result}; use anyhow::{format_err, Context, Result};
use async_trait::async_trait;
use custom::CustomAdapterConfig; use custom::CustomAdapterConfig;
use custom::BUILTIN_SPAWNING_ADAPTERS; use custom::BUILTIN_SPAWNING_ADAPTERS;
use log::*; use log::*;
@ -76,11 +77,17 @@ impl AdapterMeta {
pub trait GetMetadata { pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta; fn metadata(&self) -> &AdapterMeta;
} }
#[async_trait]
pub trait FileAdapter: GetMetadata + Send + Sync { pub trait FileAdapter: GetMetadata + Send + Sync {
/// adapt a file. /// adapt a file.
/// ///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
fn adapt(&self, a: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox>; async fn adapt(
&self,
a: AdaptInfo,
detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox>;
} }
pub struct AdaptInfo { pub struct AdaptInfo {

View File

@ -224,8 +224,9 @@ impl CustomSpawningFileAdapter {
Ok(command) Ok(command)
} }
} }
#[async_trait]
impl FileAdapter for CustomSpawningFileAdapter { impl FileAdapter for CustomSpawningFileAdapter {
fn adapt<'a>( async fn adapt(
&self, &self,
ai: AdaptInfo, ai: AdaptInfo,
_detection_reason: &FileMatcher, _detection_reason: &FileMatcher,
@ -314,7 +315,7 @@ mod test {
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
// let r = adapter.adapt(a, &d)?; // let r = adapter.adapt(a, &d)?;
let r = loop_adapt(&adapter, d, a)?; let r = loop_adapt(&adapter, d, a).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,
@ -368,7 +369,7 @@ PREFIX:Page 1:
Path::new("foo.txt"), Path::new("foo.txt"),
Box::pin(Cursor::new(Vec::from(input))), Box::pin(Cursor::new(Vec::from(input))),
); );
let output = adapter.adapt(a, &d).unwrap(); let output = adapter.adapt(a, &d).await.unwrap();
let oup = adapted_to_vec(output).await?; let oup = adapted_to_vec(output).await?;
println!("output: {}", String::from_utf8_lossy(&oup)); println!("output: {}", String::from_utf8_lossy(&oup));

View File

@ -93,8 +93,13 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
filename.with_file_name(format!("{}{}", stem, new_extension)) filename.with_file_name(format!("{}{}", stem, new_extension))
} }
#[async_trait]
impl FileAdapter for DecompressAdapter { impl FileAdapter for DecompressAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
Ok(one_file(AdaptInfo { Ok(one_file(AdaptInfo {
filepath_hint: get_inner_filename(&ai.filepath_hint), filepath_hint: get_inner_filename(&ai.filepath_hint),
is_real_file: false, is_real_file: false,
@ -137,7 +142,7 @@ mod tests {
let filepath = test_data_dir().join("hello.gz"); let filepath = test_data_dir().join("hello.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = adapter.adapt(a, &d)?; let r = adapter.adapt(a, &d).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!(String::from_utf8(o)?, "hello\n"); assert_eq!(String::from_utf8(o)?, "hello\n");
Ok(()) Ok(())
@ -150,7 +155,7 @@ mod tests {
let filepath = test_data_dir().join("short.pdf.gz"); let filepath = test_data_dir().join("short.pdf.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = loop_adapt(&adapter, d, a)?; let r = loop_adapt(&adapter, d, a).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,

View File

@ -4,6 +4,7 @@
use anyhow::Result; use anyhow::Result;
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait;
use bytes::Bytes; use bytes::Bytes;
use encoding_rs::Encoding; use encoding_rs::Encoding;
use encoding_rs_io::DecodeReaderBytesBuilder; use encoding_rs_io::DecodeReaderBytesBuilder;
@ -44,8 +45,9 @@ impl GetMetadata for PostprocPrefix {
&METADATA &METADATA
} }
} }
#[async_trait]
impl FileAdapter for PostprocPrefix { impl FileAdapter for PostprocPrefix {
fn adapt<'a>( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher, _detection_reason: &crate::matching::FileMatcher,
@ -87,18 +89,12 @@ async fn postproc_encoding(
let mut beginning = inp.take(1 << 13); let mut beginning = inp.take(1 << 13);
beginning.read_to_end(&mut fourk).await?; beginning.read_to_end(&mut fourk).await?;
let has_binary = fourk.contains(&0u8);
if fourk.contains(&0u8) {
log::debug!("detected binary");
let v = "[rga: binary data]";
return Ok(Box::pin(std::io::Cursor::new(v)));
}
let enc = Encoding::for_bom(&fourk); let enc = Encoding::for_bom(&fourk);
let inp = std::io::Cursor::new(fourk).chain(beginning.into_inner()); let inp = Cursor::new(fourk).chain(beginning.into_inner());
match enc { match enc {
None => Ok(Box::pin(inp)), Some((enc, _)) if enc != encoding_rs::UTF_8 => {
Some((enc, _)) if enc == encoding_rs::UTF_8 => Ok(Box::pin(inp)),
Some(_) => {
// detected UTF16LE or UTF16BE, convert to UTF8 in separate thread // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
// TODO: parse these options from ripgrep's configuration // TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8 let encoding = None; // detect bom but usually assume utf8
@ -120,7 +116,14 @@ async fn postproc_encoding(
Ok(oup) Ok(oup)
}) })
.await??; .await??;
Ok(Box::pin(std::io::Cursor::new(oup))) Ok(Box::pin(Cursor::new(oup)))
}
_ => {
if has_binary {
log::debug!("detected binary");
return Ok(Box::pin(Cursor::new("[rga: binary data]")));
}
Ok(Box::pin(inp))
} }
} }
} }
@ -169,13 +172,14 @@ impl GetMetadata for PostprocPageBreaks {
&METADATA &METADATA
} }
} }
#[async_trait]
impl FileAdapter for PostprocPageBreaks { impl FileAdapter for PostprocPageBreaks {
fn adapt<'a>( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher, _detection_reason: &crate::matching::FileMatcher,
) -> Result<AdaptedFilesIterBox> { ) -> Result<AdaptedFilesIterBox> {
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?); let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?);
// keep adapt info (filename etc) except replace inp // keep adapt info (filename etc) except replace inp
let ai = AdaptInfo { let ai = AdaptInfo {
inp: Box::pin(read), inp: Box::pin(read),
@ -287,7 +291,7 @@ mod tests {
let fname = test_data_dir().join("twoblankpages.pdf"); let fname = test_data_dir().join("twoblankpages.pdf");
let rd = File::open(&fname).await?; let rd = File::open(&fname).await?;
let (a, d) = simple_adapt_info(&fname, Box::pin(rd)); let (a, d) = simple_adapt_info(&fname, Box::pin(rd));
let res = loop_adapt(&adapter, d, a)?; let res = loop_adapt(&adapter, d, a).await?;
let buf = adapted_to_vec(res).await?; let buf = adapted_to_vec(res).await?;
@ -332,7 +336,8 @@ PREFIX:Page 3:
b: &str, b: &str,
) -> Result<()> { ) -> Result<()> {
let mut oup = Vec::new(); let mut oup = Vec::new();
let inp = postproc_encoding("", a)?; let inp = Box::pin(Cursor::new(a));
let inp = postproc_encoding("", inp).await?;
if pagebreaks { if pagebreaks {
postproc_pagebreaks(inp).read_to_end(&mut oup).await?; postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
} else { } else {
@ -346,6 +351,23 @@ PREFIX:Page 3:
Ok(()) Ok(())
} }
#[tokio::test]
async fn test_utf16() -> Result<()> {
let utf16lebom: &[u8] = &[
0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00,
0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8,
0xa9, 0xdc, 0x0a, 0x00,
];
let utf16bebom: &[u8] = &[
0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20,
0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d,
0xdc, 0xa9, 0x00, 0x0a,
];
test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?;
test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?;
Ok(())
}
#[tokio::test] #[tokio::test]
async fn post1() -> Result<()> { async fn post1() -> Result<()> {
let inp = "What is this\nThis is a test\nFoo"; let inp = "What is this\nThis is a test\nFoo";
@ -367,8 +389,7 @@ PREFIX:Page 3:
Ok(()) Ok(())
} }
/*
todo: uncomment when fixed
#[tokio::test] #[tokio::test]
async fn test_binary_content() -> Result<()> { async fn test_binary_content() -> Result<()> {
test_from_strs( test_from_strs(
@ -380,7 +401,7 @@ PREFIX:Page 3:
.await?; .await?;
test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?; test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
Ok(()) Ok(())
}*/ }
/*#[test] /*#[test]
fn chardet() -> Result<()> { fn chardet() -> Result<()> {

View File

@ -137,7 +137,7 @@ mod test {
let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default()); let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default());
let fname = test_data_dir().join("hello.sqlite3"); let fname = test_data_dir().join("hello.sqlite3");
let (a, d) = simple_fs_adapt_info(&fname).await?; let (a, d) = simple_fs_adapt_info(&fname).await?;
let res = adapter.adapt(a, &d)?; let res = adapter.adapt(a, &d).await?;
let buf = adapted_to_vec(res).await?; let buf = adapted_to_vec(res).await?;

View File

@ -6,6 +6,7 @@ use crate::{
}; };
use anyhow::*; use anyhow::*;
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::*; use log::*;
use std::path::PathBuf; use std::path::PathBuf;
@ -45,8 +46,13 @@ impl GetMetadata for TarAdapter {
} }
} }
#[async_trait]
impl FileAdapter for TarAdapter { impl FileAdapter for TarAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
_detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
inp, inp,
@ -103,7 +109,7 @@ mod tests {
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let adapter = TarAdapter::new(); let adapter = TarAdapter::new();
let r = loop_adapt(&adapter, d, a).context("adapt")?; let r = loop_adapt(&adapter, d, a).await.context("adapt")?;
let o = adapted_to_vec(r).await.context("adapted_to_vec")?; let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
assert_eq!( assert_eq!(
String::from_utf8(o).context("parsing utf8")?, String::from_utf8(o).context("parsing utf8")?,

View File

@ -41,11 +41,12 @@ macro_rules! async_writeln {
} }
pub(crate) use async_writeln; pub(crate) use async_writeln;
#[async_trait]
impl<T> FileAdapter for T impl<T> FileAdapter for T
where where
T: WritingFileAdapter, T: WritingFileAdapter,
{ {
fn adapt( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
detection_reason: &crate::matching::FileMatcher, detection_reason: &crate::matching::FileMatcher,

View File

@ -36,8 +36,13 @@ impl GetMetadata for ZipAdapter {
} }
} }
#[async_trait]
impl FileAdapter for ZipAdapter { impl FileAdapter for ZipAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
_detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
// let (s, r) = mpsc::channel(1); // let (s, r) = mpsc::channel(1);
let AdaptInfo { let AdaptInfo {
inp, inp,
@ -52,8 +57,8 @@ impl FileAdapter for ZipAdapter {
if is_real_file { if is_real_file {
use async_zip::read::fs::ZipFileReader; use async_zip::read::fs::ZipFileReader;
let s = stream! {
let zip = ZipFileReader::new(&filepath_hint).await?; let zip = ZipFileReader::new(&filepath_hint).await?;
let s = stream! {
for i in 0..zip.entries().len() { for i in 0..zip.entries().len() {
let reader = zip.entry_reader(i).await?; let reader = zip.entry_reader(i).await?;
let file = reader.entry(); let file = reader.entry();
@ -183,7 +188,6 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
mod test { mod test {
use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder}; use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder};
use super::*; use super::*;
use crate::{preproc::loop_adapt, test_utils::*}; use crate::{preproc::loop_adapt, test_utils::*};
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
@ -213,7 +217,7 @@ mod test {
async fn only_seek_zip_fs() -> Result<()> { async fn only_seek_zip_fs() -> Result<()> {
let zip = test_data_dir().join("only-seek-zip.zip"); let zip = test_data_dir().join("only-seek-zip.zip");
let (a, d) = simple_fs_adapt_info(&zip).await?; let (a, d) = simple_fs_adapt_info(&zip).await?;
let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?; let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?;
// assert_eq!(String::from_utf8(v)?, ""); // assert_eq!(String::from_utf8(v)?, "");
Ok(()) Ok(())
@ -236,7 +240,7 @@ mod test {
&PathBuf::from("outer.zip"), &PathBuf::from("outer.zip"),
Box::pin(std::io::Cursor::new(zipfile)), Box::pin(std::io::Cursor::new(zipfile)),
); );
let buf = adapted_to_vec(loop_adapt(&adapter, d, a)?).await?; let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?;
assert_eq!( assert_eq!(
String::from_utf8(buf)?, String::from_utf8(buf)?,

View File

@ -11,11 +11,14 @@ use crate::{
use anyhow::*; use anyhow::*;
use async_compression::tokio::bufread::ZstdDecoder; use async_compression::tokio::bufread::ZstdDecoder;
use async_stream::stream; use async_stream::stream;
// use futures::future::{BoxFuture, FutureExt};
use log::*; use log::*;
use path_clean::PathClean; use path_clean::PathClean;
use postproc::PostprocPrefix; use postproc::PostprocPrefix;
use std::future::Future;
use std::io::Cursor; use std::io::Cursor;
use std::path::Path; use std::path::Path;
use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use tokio::io::AsyncBufRead; use tokio::io::AsyncBufRead;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
@ -185,7 +188,7 @@ async fn adapt_caching(
Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))), Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
None => { None => {
debug!("cache MISS, running adapter with caching..."); debug!("cache MISS, running adapter with caching...");
let inp = loop_adapt(adapter.as_ref(), detection_reason, ai)?; let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
let inp = concat_read_streams(inp); let inp = concat_read_streams(inp);
let inp = async_read_and_write_to_cache( let inp = async_read_and_write_to_cache(
inp, inp,
@ -213,9 +216,19 @@ pub fn loop_adapt(
adapter: &dyn FileAdapter, adapter: &dyn FileAdapter,
detection_reason: FileMatcher, detection_reason: FileMatcher,
ai: AdaptInfo, ai: AdaptInfo,
) -> Pin<Box<dyn Future<Output = anyhow::Result<AdaptedFilesIterBox>> + Send + '_>> {
Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
}
pub async fn loop_adapt_inner(
adapter: &dyn FileAdapter,
detection_reason: FileMatcher,
ai: AdaptInfo,
) -> anyhow::Result<AdaptedFilesIterBox> { ) -> anyhow::Result<AdaptedFilesIterBox> {
let fph = ai.filepath_hint.clone(); let fph = ai.filepath_hint.clone();
let inp = adapter.adapt(ai, &detection_reason).with_context(|| { let inp = adapter
.adapt(ai, &detection_reason)
.await
.with_context(|| {
format!( format!(
"adapting {} via {} failed", "adapting {} via {} failed",
fph.to_string_lossy(), fph.to_string_lossy(),
@ -243,7 +256,7 @@ pub fn loop_adapt(
ai.filepath_hint.to_string_lossy(), ai.filepath_hint.to_string_lossy(),
&adapter.metadata().name &adapter.metadata().name
); );
for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai)? { for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
yield ifile; yield ifile;
} }
} }