From d5606094f577c803fea064fdd766d0d4bd56202b Mon Sep 17 00:00:00 2001 From: phiresky Date: Fri, 7 Jun 2019 00:57:53 +0200 Subject: [PATCH] binary file detection --- Cargo.lock | 20 +++++++++++ Cargo.toml | 2 ++ src/adapters/spawning.rs | 72 ++++++++++++---------------------------- src/preproc.rs | 3 +- 4 files changed, 45 insertions(+), 52 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b60f5c7..e6aa29d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -276,6 +276,22 @@ name = "either" version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "encoding_rs" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "env_logger" version = "0.6.1" @@ -779,6 +795,8 @@ dependencies = [ "chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "flate2 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1162,6 +1180,8 @@ dependencies = [ "checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" "checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" "checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" +"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed" +"checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c" "checksum env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b61fa891024a945da30a9581546e8cfaf5602c7b3f4c137a2805cf388f92075a" "checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2" "checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1" diff --git a/Cargo.toml b/Cargo.toml index a9542c4..5d6c21f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,3 +38,5 @@ flate2 = "1.0.7" bzip2 = "0.3.3" tar = "0.4.26" chrono = "0.4.6" +encoding_rs = "0.8.17" +encoding_rs_io = "0.1.6" diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs index bf95056..657d9f8 100644 --- a/src/adapters/spawning.rs +++ b/src/adapters/spawning.rs @@ -5,26 +5,32 @@ use std::io::BufReader; use std::process::Command; use std::process::Stdio; +/** + * Copy a Read to a Write, while prefixing every line with a prefix. + * + * Try to detect binary files and ignore them. Does not ensure any encoding in the output. + */ pub fn postproc_line_prefix( line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write, ) -> Fallible<()> { - //std::io::copy(inp, oup)?; - - for line in BufReader::new(inp).lines() { - match line { - Ok(line) => { - oup.write_all(format!("{}{}\n", line_prefix, line).as_bytes())?; - } - Err(e) => { - if e.kind() == std::io::ErrorKind::InvalidData { - oup.write_all(format!("{}[binary]\n", line_prefix).as_bytes())?; - } else { - Err(e)?; - } - } + let mut reader = BufReader::with_capacity(1 << 12, inp); + let fourk = reader.fill_buf()?; + if fourk.contains(&0u8) { + oup.write_all(format!("{}[binary data]\n", line_prefix).as_bytes())?; + return Ok(()); + } + // intentionally do not call reader.consume + for line in reader.split(b'\n') { + let line = line?; + if line.contains(&0u8) { + oup.write_all(format!("{}[binary data]\n", line_prefix).as_bytes())?; + return Ok(()); } + oup.write_all(line_prefix.as_bytes())?; + oup.write_all(&line)?; + oup.write_all(b"\n")?; } Ok(()) } @@ -44,43 +50,6 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error { _ => Error::from(err), } } - -/*fn pipe(a: &mut dyn Read, b: &mut dyn Write, c: &mut dyn Read, d: &mut dyn Write) { - let mut buf = vec![0u8; 2 << 13]; - loop { - match a.read(&buf) { - - } - } -}*/ - -/*pub fn copy( - name: &str, - reader: &mut R, - writer: &mut W, -) -> std::io::Result -where - R: Read, - W: Write, -{ - eprintln!("START COPY {}", name); - let mut zz = vec![0; 1 << 13]; - let mut buf: &mut [u8] = zz.as_mut(); - let mut written = 0; - loop { - let r = reader.read(buf); - eprintln!("{}read: {:?}", name, r); - let len = match r { - Ok(0) => return Ok(written), - Ok(len) => len, - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - }; - writer.write_all(&buf[..len])?; - written += len as u64; - } -}*/ - pub fn pipe_output( line_prefix: &str, mut cmd: Command, @@ -98,6 +67,7 @@ pub fn pipe_output( let mut stdi = cmd.stdin.take().expect("is piped"); let mut stdo = cmd.stdout.take().expect("is piped"); + // TODO: how to handle this copying better? crossbeam::scope(|s| -> Fallible<()> { s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors? std::io::copy(inp, &mut stdi)?; diff --git a/src/preproc.rs b/src/preproc.rs index 98385d3..5e45b69 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -142,7 +142,8 @@ pub fn rga_preproc<'a>( } } None => { - // allow passthrough if the file is in an archive, otherwise it should have been filtered out by rg + // allow passthrough if the file is in an archive, + // otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us let allow_cat = !is_real_file; if allow_cat { spawning::postproc_line_prefix(line_prefix, inp, oup)?;