partial migration to read->read

This commit is contained in:
phiresky 2020-06-11 23:09:31 +02:00
parent d0d74adfe9
commit 2f580b135a
24 changed files with 593 additions and 155 deletions

24
Cargo.lock generated
View File

@ -288,6 +288,27 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "dyn-clonable"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e9232f0e607a262ceb9bd5141a3dfb3e4db6994b31989bbfd845878cba59fd4"
dependencies = [
"dyn-clonable-impl",
"dyn-clone",
]
[[package]]
name = "dyn-clonable-impl"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "558e40ea573c374cf53507fd240b7ee2f5477df7cfebdb97323ec61c719399c5"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "dyn-clone" name = "dyn-clone"
version = "1.0.1" version = "1.0.1"
@ -950,8 +971,11 @@ dependencies = [
"chrono", "chrono",
"clap", "clap",
"crossbeam", "crossbeam",
"crossbeam-channel",
"derive_more", "derive_more",
"directories-next", "directories-next",
"dyn-clonable",
"dyn-clone",
"encoding_rs", "encoding_rs",
"encoding_rs_io", "encoding_rs_io",
"env_logger", "env_logger",

View File

@ -49,3 +49,6 @@ directories-next = "1.0.1"
derive_more = "0.99.7" derive_more = "0.99.7"
pretty-bytes = "0.2.2" pretty-bytes = "0.2.2"
memchr = "2.3.3" memchr = "2.3.3"
crossbeam-channel = "0.4.2"
dyn-clone = "1.0.1"
dyn-clonable = "0.9.0"

BIN
exampledir/test.djvu Normal file

Binary file not shown.

BIN
exampledir/test/hello.gz Normal file

Binary file not shown.

Binary file not shown.

BIN
exampledir/test/short.pdf Normal file

Binary file not shown.

Binary file not shown.

View File

@ -1,28 +1,31 @@
pub mod custom; pub mod custom;
pub mod decompress; pub mod decompress;
pub mod ffmpeg; //pub mod ffmpeg;
pub mod fns; pub mod fns;
pub mod pdfpages; //pub mod pdfpages;
pub mod poppler; pub mod poppler;
pub mod spawning; pub mod spawning;
pub mod sqlite; pub mod sqlite;
pub mod tar; //pub mod tar;
pub mod tesseract; //pub mod tesseract;
pub mod zip; pub mod writing;
// pub mod zip;
use crate::matching::*; use crate::matching::*;
use crate::preproc::PreprocConfig; use crate::preproc::PreprocConfig;
use anyhow::*; use anyhow::*;
use custom::builtin_spawning_adapters; use custom::builtin_spawning_adapters;
use custom::CustomAdapterConfig; use custom::CustomAdapterConfig;
use log::*; use log::*;
use regex::Regex;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::io::prelude::*; use std::io::prelude::*;
use std::iter::Iterator; use std::iter::Iterator;
use std::path::Path; use std::path::{Path, PathBuf};
use std::rc::Rc; use std::rc::Rc;
pub type ReadBox = Box<dyn Read + Send>;
pub struct AdapterMeta { pub struct AdapterMeta {
/// unique short name of this adapter (a-z0-9 only) /// unique short name of this adapter (a-z0-9 only)
pub name: String, pub name: String,
@ -63,22 +66,20 @@ pub trait FileAdapter: GetMetadata {
/// adapt a file. /// adapt a file.
/// ///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
fn adapt(&self, a: AdaptInfo, detection_reason: &SlowMatcher) -> Result<()>; fn adapt(&self, a: AdaptInfo, detection_reason: &SlowMatcher) -> Result<ReadBox>;
} }
pub struct AdaptInfo<'a> { pub struct AdaptInfo {
/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions. /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
pub filepath_hint: &'a Path, pub filepath_hint: PathBuf,
/// true if filepath_hint is an actual file on the file system /// true if filepath_hint is an actual file on the file system
pub is_real_file: bool, pub is_real_file: bool,
/// depth at which this file is in archives. 0 for real filesystem /// depth at which this file is in archives. 0 for real filesystem
pub archive_recursion_depth: i32, pub archive_recursion_depth: i32,
/// stream to read the file from. can be from a file or from some decoder /// stream to read the file from. can be from a file or from some decoder
pub inp: &'a mut dyn Read, pub inp: ReadBox,
/// stream to write to. will be written to from a different thread
pub oup: &'a mut (dyn Write + Send),
/// prefix every output line with this string to better indicate the file's location if it is in some archive /// prefix every output line with this string to better indicate the file's location if it is in some archive
pub line_prefix: &'a str, pub line_prefix: String,
pub config: PreprocConfig<'a>, pub config: PreprocConfig,
} }
/// (enabledAdapters, disabledAdapters) /// (enabledAdapters, disabledAdapters)
@ -94,13 +95,13 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
} }
let internal_adapters: Vec<Rc<dyn FileAdapter>> = vec![ let internal_adapters: Vec<Rc<dyn FileAdapter>> = vec![
Rc::new(ffmpeg::FFmpegAdapter::new()), //Rc::new(ffmpeg::FFmpegAdapter::new()),
Rc::new(zip::ZipAdapter::new()), //Rc::new(zip::ZipAdapter::new()),
Rc::new(decompress::DecompressAdapter::new()), Rc::new(decompress::DecompressAdapter::new()),
Rc::new(tar::TarAdapter::new()), // Rc::new(tar::TarAdapter::new()),
Rc::new(sqlite::SqliteAdapter::new()), Rc::new(sqlite::SqliteAdapter::new()),
Rc::new(pdfpages::PdfPagesAdapter::new()), // Rc::new(pdfpages::PdfPagesAdapter::new()),
Rc::new(tesseract::TesseractAdapter::new()), //Rc::new(tesseract::TesseractAdapter::new()),
]; ];
adapters.extend( adapters.extend(
builtin_spawning_adapters builtin_spawning_adapters

View File

@ -1,4 +1,7 @@
use super::{spawning::SpawningFileAdapter, AdapterMeta, GetMetadata}; use super::{
spawning::{SpawningFileAdapter, SpawningFileAdapterTrait},
AdapterMeta, GetMetadata,
};
use crate::matching::{FastMatcher, SlowMatcher}; use crate::matching::{FastMatcher, SlowMatcher};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use schemars::JsonSchema; use schemars::JsonSchema;
@ -112,7 +115,7 @@ impl GetMetadata for CustomSpawningFileAdapter {
&self.meta &self.meta
} }
} }
impl SpawningFileAdapter for CustomSpawningFileAdapter { impl SpawningFileAdapterTrait for CustomSpawningFileAdapter {
fn get_exe(&self) -> &str { fn get_exe(&self) -> &str {
&self.binary &self.binary
} }
@ -126,12 +129,12 @@ impl SpawningFileAdapter for CustomSpawningFileAdapter {
} }
} }
impl CustomAdapterConfig { impl CustomAdapterConfig {
pub fn to_adapter(self) -> CustomSpawningFileAdapter { pub fn to_adapter(&self) -> SpawningFileAdapter {
CustomSpawningFileAdapter { let ad = CustomSpawningFileAdapter {
binary: self.binary.clone(), binary: self.binary.clone(),
args: self.args.clone(), args: self.args.clone(),
meta: AdapterMeta { meta: AdapterMeta {
name: self.name, name: self.name.clone(),
version: self.version, version: self.version,
description: format!( description: format!(
"{}\nRuns: {} {}", "{}\nRuns: {} {}",
@ -145,7 +148,7 @@ impl CustomAdapterConfig {
.iter() .iter()
.map(|s| FastMatcher::FileExtension(s.to_string())) .map(|s| FastMatcher::FileExtension(s.to_string()))
.collect(), .collect(),
slow_matchers: self.mimetypes.map(|mimetypes| { slow_matchers: self.mimetypes.as_ref().map(|mimetypes| {
mimetypes mimetypes
.iter() .iter()
.map(|s| SlowMatcher::MimeType(s.to_string())) .map(|s| SlowMatcher::MimeType(s.to_string()))
@ -153,6 +156,43 @@ impl CustomAdapterConfig {
}), }),
disabled_by_default: self.disabled_by_default.unwrap_or(false), disabled_by_default: self.disabled_by_default.unwrap_or(false),
}, },
} };
SpawningFileAdapter::new(Box::new(ad))
}
}
#[cfg(test)]
mod test {
use super::super::FileAdapter;
use super::*;
use crate::test_utils::*;
use anyhow::Result;
use std::fs::File;
#[test]
fn poppler() -> Result<()> {
let adapter = builtin_spawning_adapters
.iter()
.find(|e| e.name == "poppler")
.expect("no poppler adapter");
let adapter = adapter.to_adapter();
let filepath = test_data_dir().join("short.pdf");
let (a, d) = simple_adapt_info(&filepath, Box::new(File::open(&filepath)?));
let mut r = adapter.adapt(a, &d)?;
let mut o = Vec::new();
r.read_to_end(&mut o)?;
assert_eq!(
String::from_utf8(o)?,
"hello world
this is just a test.
1
\u{c}"
);
Ok(())
} }
} }

View File

@ -1,6 +1,6 @@
use super::*; use super::*;
use crate::preproc::rga_preproc; use crate::preproc::rga_preproc;
use anyhow::*; use anyhow::Result;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use std::path::PathBuf; use std::path::PathBuf;
@ -47,16 +47,13 @@ impl GetMetadata for DecompressAdapter {
} }
} }
fn decompress_any<'a, R>(reason: &SlowMatcher, inp: &'a mut R) -> Result<Box<dyn Read + 'a>> fn decompress_any(reason: &SlowMatcher, inp: ReadBox) -> Result<ReadBox> {
where
R: Read,
{
use FastMatcher::*; use FastMatcher::*;
use SlowMatcher::*; use SlowMatcher::*;
let gz = |inp: &'a mut R| Box::new(flate2::read::MultiGzDecoder::new(inp)); let gz = |inp: ReadBox| Box::new(flate2::read::MultiGzDecoder::new(inp));
let bz2 = |inp: &'a mut R| Box::new(bzip2::read::BzDecoder::new(inp)); let bz2 = |inp: ReadBox| Box::new(bzip2::read::BzDecoder::new(inp));
let xz = |inp: &'a mut R| Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)); let xz = |inp: ReadBox| Box::new(xz2::read::XzDecoder::new_multi_decoder(inp));
let zst = |inp: &'a mut R| zstd::stream::read::Decoder::new(inp); // returns result let zst = |inp: ReadBox| zstd::stream::read::Decoder::new(inp); // returns result
Ok(match reason { Ok(match reason {
Fast(FileExtension(ext)) => match ext.as_ref() { Fast(FileExtension(ext)) => match ext.as_ref() {
@ -92,35 +89,33 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
} }
impl FileAdapter for DecompressAdapter { impl FileAdapter for DecompressAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Result<()> { fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Result<ReadBox> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, inp,
oup,
line_prefix, line_prefix,
archive_recursion_depth, archive_recursion_depth,
config, config,
.. ..
} = ai; } = ai;
let mut decompress = decompress_any(detection_reason, &mut inp)?;
let ai2: AdaptInfo = AdaptInfo { let ai2: AdaptInfo = AdaptInfo {
filepath_hint: &get_inner_filename(filepath_hint), filepath_hint: get_inner_filename(&filepath_hint),
is_real_file: false, is_real_file: false,
archive_recursion_depth: archive_recursion_depth + 1, archive_recursion_depth: archive_recursion_depth + 1,
inp: &mut decompress, inp: decompress_any(detection_reason, inp)?,
oup,
line_prefix, line_prefix,
config: config.clone(), config: config.clone(),
}; };
rga_preproc(ai2)?; rga_preproc(ai2)
Ok(())
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::test_utils::*;
use std::fs::File;
#[test] #[test]
fn test_inner_filename() { fn test_inner_filename() {
for (a, b) in &[ for (a, b) in &[
@ -135,4 +130,40 @@ mod tests {
assert_eq!(get_inner_filename(&PathBuf::from(a)), PathBuf::from(*b)); assert_eq!(get_inner_filename(&PathBuf::from(a)), PathBuf::from(*b));
} }
} }
/// Adapting a plain gzip file must yield its decompressed text.
#[test]
fn gz() -> Result<()> {
    let gz_path = test_data_dir().join("hello.gz");
    let source = File::open(&gz_path)?;
    let (info, reason) = simple_adapt_info(&gz_path, Box::new(source));
    let adapter = DecompressAdapter;
    let mut reader = adapter.adapt(info, &reason)?;
    let mut bytes = Vec::new();
    reader.read_to_end(&mut bytes)?;
    assert_eq!(String::from_utf8(bytes)?, "hello\n");
    Ok(())
}
/// Adapting a gzip-compressed PDF must yield the PDF's extracted text.
///
/// `DecompressAdapter::adapt` passes the decompressed stream on to
/// `rga_preproc`, so the expected output here is text rather than raw PDF
/// bytes (presumably produced by the poppler adapter — its test expects the
/// same text for `short.pdf`).
#[test]
fn pdf_gz() -> Result<()> {
    let adapter = DecompressAdapter;
    let filepath = test_data_dir().join("short.pdf.gz");
    let (a, d) = simple_adapt_info(&filepath, Box::new(File::open(&filepath)?));
    let mut r = adapter.adapt(a, &d)?;
    let mut o = Vec::new();
    r.read_to_end(&mut o)?;
    // The literal's continuation lines are deliberately unindented: every
    // byte of it is compared against the adapter output.
    assert_eq!(
        String::from_utf8(o)?,
        "hello world
this is just a test.
1
\u{c}"
    );
    Ok(())
}
} }

View File

@ -92,7 +92,7 @@ where
} }
} }
pub fn postprocB(line_prefix: &str, inp: impl Read) -> Result<impl Read> { pub fn postprocB(_line_prefix: &str, inp: impl Read) -> Result<impl Read> {
let mut page_count = 1; let mut page_count = 1;
Ok(ByteReplacer { Ok(ByteReplacer {

View File

@ -1,8 +1,8 @@
use super::*;
use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::io::BufReader;
use std::process::Command;
/* /*
static EXTENSIONS: &[&str] = &["pdf"]; static EXTENSIONS: &[&str] = &["pdf"];

View File

@ -5,7 +5,7 @@ use log::*;
use std::io::prelude::*; use std::io::prelude::*;
use std::io::BufReader; use std::io::BufReader;
use std::process::Command; use std::process::Command;
use std::process::Stdio; use std::process::{Child, Stdio};
/** /**
* Copy a Read to a Write, while prefixing every line with a prefix. * Copy a Read to a Write, while prefixing every line with a prefix.
@ -53,15 +53,37 @@ pub fn postproc_line_prefix(
} }
Ok(()) Ok(())
} }
pub trait SpawningFileAdapter: GetMetadata { pub trait SpawningFileAdapterTrait: GetMetadata {
fn get_exe(&self) -> &str; fn get_exe(&self) -> &str;
fn command(&self, filepath_hint: &Path, command: Command) -> Command; fn command(&self, filepath_hint: &Path, command: Command) -> Command;
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Result<()> { /*fn postproc(&self, line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Result<()> {
postproc_line_prefix(line_prefix, inp, oup) postproc_line_prefix(line_prefix, inp, oup)
}*/
}
/// Wrapper around a boxed [`SpawningFileAdapterTrait`] implementation.
///
/// `FileAdapter` is implemented for this wrapper (rather than for every
/// `SpawningFileAdapterTrait` type); its `adapt` builds the command from the
/// inner adapter's `get_exe` and `command`.
pub struct SpawningFileAdapter {
    /// the adapter describing which executable to run and how to invoke it
    inner: Box<dyn SpawningFileAdapterTrait>,
}
impl SpawningFileAdapter {
pub fn new(inner: Box<dyn SpawningFileAdapterTrait>) -> SpawningFileAdapter {
SpawningFileAdapter { inner }
} }
} }
impl GetMetadata for SpawningFileAdapter {
    /// Returns the metadata of the wrapped adapter unchanged.
    fn metadata(&self) -> &AdapterMeta {
        self.inner.metadata()
    }
}
/*impl<T: SpawningFileAdapterTrait> From<T> for SpawningFileAdapter {
fn from(e: dyn T) -> Self {
SpawningFileAdapter { inner: Box::new(e) }
}
}*/
/// replace a Command.spawn() error "File not found" with a more readable error /// replace a Command.spawn() error "File not found" with a more readable error
/// to indicate some program is not installed /// to indicate some program is not installed
pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error { pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
@ -71,63 +93,61 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
_ => Error::from(err), _ => Error::from(err),
} }
} }
/// A reader that never yields data: reading from it waits for the child
/// process to exit and turns a failing exit status into an I/O error.
///
/// `pipe_output` chains it after the child's stdout so that a failed
/// subprocess surfaces as a read error once its output has been consumed.
struct ProcWaitReader {
    /// the spawned subprocess whose exit status is checked on read
    proce: Child,
}
impl Read for ProcWaitReader {
    /// Waits for the subprocess to finish.
    ///
    /// Returns `Ok(0)` (end of stream) if the process exited successfully,
    /// otherwise an `io::Error` of kind `Other` describing the exit status.
    /// The buffer is never written to.
    fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
        let exit_status = self.proce.wait()?;
        if !exit_status.success() {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                format_err!("subprocess failed: {:?}", exit_status),
            ));
        }
        Ok(0)
    }
}
pub fn pipe_output( pub fn pipe_output(
line_prefix: &str, _line_prefix: &str,
mut cmd: Command, mut cmd: Command,
inp: &mut (dyn Read), inp: &mut (dyn Read),
oup: &mut (dyn Write + Send),
exe_name: &str, exe_name: &str,
help: &str, help: &str,
cp: fn(line_prefix: &str, &mut dyn Read, &mut dyn Write) -> Result<()>, ) -> Result<ReadBox> {
) -> Result<()> {
let mut cmd = cmd let mut cmd = cmd
.stdin(Stdio::piped()) .stdin(Stdio::piped())
.stdout(Stdio::piped()) .stdout(Stdio::piped())
.spawn() .spawn()
.map_err(|e| map_exe_error(e, exe_name, help))?; .map_err(|e| map_exe_error(e, exe_name, help))?;
let mut stdi = cmd.stdin.take().expect("is piped"); let mut stdi = cmd.stdin.take().expect("is piped");
let mut stdo = cmd.stdout.take().expect("is piped"); let stdo = cmd.stdout.take().expect("is piped");
// TODO: how to handle this copying better? // TODO: how to handle this copying better?
// do we really need threads for this? // do we really need threads for this?
crossbeam::scope(|s| -> Result<()> { crossbeam::scope(|_s| -> Result<()> {
s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
std::io::copy(inp, &mut stdi)?; std::io::copy(inp, &mut stdi)?;
drop(stdi); // NEEDED! otherwise deadlock drop(stdi); // NEEDED! otherwise deadlock
Ok(()) Ok(())
}) })
.unwrap()?; .unwrap()?;
let status = cmd.wait()?; Ok(Box::new(stdo.chain(ProcWaitReader { proce: cmd })))
if status.success() {
Ok(())
} else {
Err(format_err!("subprocess failed: {:?}", status))
}
} }
impl<T> FileAdapter for T impl FileAdapter for SpawningFileAdapter {
where fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Result<ReadBox> {
T: SpawningFileAdapter,
{
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Result<()> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,
oup,
line_prefix, line_prefix,
.. ..
} = ai; } = ai;
let cmd = Command::new(self.get_exe());
let cmd = self.command(filepath_hint, cmd); let cmd = Command::new(self.inner.get_exe());
let cmd = self.inner.command(&filepath_hint, cmd);
debug!("executing {:?}", cmd); debug!("executing {:?}", cmd);
pipe_output( pipe_output(&line_prefix, cmd, &mut inp, self.inner.get_exe(), "")
line_prefix,
cmd,
&mut inp,
oup,
self.get_exe(),
"",
Self::postproc,
)
} }
} }

View File

@ -5,6 +5,7 @@ use log::*;
use rusqlite::types::ValueRef; use rusqlite::types::ValueRef;
use rusqlite::*; use rusqlite::*;
use std::convert::TryInto; use std::convert::TryInto;
use writing::{WritingFileAdapter, WritingFileAdapterTrait};
static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"];
@ -27,12 +28,12 @@ lazy_static! {
}; };
} }
#[derive(Default)] #[derive(Default, Clone)]
pub struct SqliteAdapter; pub struct SqliteAdapter;
impl SqliteAdapter { impl SqliteAdapter {
pub fn new() -> SqliteAdapter { pub fn new() -> WritingFileAdapter {
SqliteAdapter WritingFileAdapter::new(Box::new(SqliteAdapter {}))
} }
} }
impl GetMetadata for SqliteAdapter { impl GetMetadata for SqliteAdapter {
@ -58,12 +59,16 @@ fn format_blob(b: ValueRef) -> String {
} }
} }
impl FileAdapter for SqliteAdapter { impl WritingFileAdapterTrait for SqliteAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Result<()> { fn adapt_write(
&self,
ai: AdaptInfo,
_detection_reason: &SlowMatcher,
oup: &mut dyn Write,
) -> Result<()> {
let AdaptInfo { let AdaptInfo {
is_real_file, is_real_file,
filepath_hint, filepath_hint,
oup,
line_prefix, line_prefix,
.. ..
} = ai; } = ai;
@ -116,3 +121,29 @@ impl FileAdapter for SqliteAdapter {
Ok(()) Ok(())
} }
} }
#[cfg(test)]
mod test {
    use super::*;
    use crate::test_utils::*;
    use std::fs::File;

    /// Adapting the example database must print every row of every table,
    /// each line carrying the line prefix supplied by `simple_adapt_info`.
    #[test]
    fn simple() -> Result<()> {
        let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::new());
        let db_path = test_data_dir().join("hello.sqlite3");
        let db_file = File::open(&db_path)?;
        let (info, reason) = simple_adapt_info(&db_path, Box::new(db_file));
        let mut output = adapter.adapt(info, &reason)?;

        let mut bytes = Vec::new();
        output.read_to_end(&mut bytes)?;
        let text = String::from_utf8(bytes)?;

        assert_eq!(
            text,
            "PREFIX:tbl: greeting='hello', from='sqlite database!'\nPREFIX:tbl2: x=123, y=456.789\n",
        );
        Ok(())
    }
}

View File

@ -1,6 +1,6 @@
use super::*; use super::*;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use spawning::SpawningFileAdapter; use spawning::{SpawningFileAdapter, SpawningFileAdapterTrait};
use std::process::Command; use std::process::Command;
static EXTENSIONS: &[&str] = &["jpg", "png"]; static EXTENSIONS: &[&str] = &["jpg", "png"];
@ -33,7 +33,7 @@ impl GetMetadata for TesseractAdapter {
&METADATA &METADATA
} }
} }
impl SpawningFileAdapter for TesseractAdapter { impl SpawningFileAdapterTrait for TesseractAdapter {
fn get_exe(&self) -> &str { fn get_exe(&self) -> &str {
"tesseract" "tesseract"
} }

51
src/adapters/writing.rs Normal file
View File

@ -0,0 +1,51 @@
use super::{FileAdapter, GetMetadata, ReadBox};
use anyhow::Result;
use std::io::Write;
/// Interface for adapters that produce their output by writing into a
/// `Write` sink instead of returning a reader.
///
/// `WritingFileAdapter` wraps an implementation of this trait and turns it
/// into a `FileAdapter`; it clones the adapter and moves the clone to another
/// thread, which is why `Clone` and `Send` are required.
#[dyn_clonable::clonable]
pub trait WritingFileAdapterTrait: GetMetadata + Send + Clone {
    /// Adapt the file described by `a`, writing the result to `oup`.
    ///
    /// `detection_reason` is the matcher that selected this adapter.
    fn adapt_write(
        &self,
        a: super::AdaptInfo,
        detection_reason: &crate::matching::SlowMatcher,
        oup: &mut dyn Write,
    ) -> Result<()>;
}
/// Wraps a [`WritingFileAdapterTrait`] implementation so it can be used as a
/// `FileAdapter` that returns a reader.
pub struct WritingFileAdapter {
    /// the write-based adapter that does the actual work
    inner: Box<dyn WritingFileAdapterTrait>,
}
impl WritingFileAdapter {
    /// Wrap the given write-based adapter.
    pub fn new(inner: Box<dyn WritingFileAdapterTrait>) -> WritingFileAdapter {
        Self { inner }
    }
}
impl GetMetadata for WritingFileAdapter {
    /// Returns the metadata of the wrapped adapter unchanged.
    fn metadata(&self) -> &super::AdapterMeta {
        self.inner.metadata()
    }
}
impl FileAdapter for WritingFileAdapter {
    /// Run the wrapped adapter on a background thread and return a reader
    /// over everything it writes.
    ///
    /// The adapter's output is sent through an in-memory pipe. If
    /// `adapt_write` fails, the error is forwarded through the pipe so the
    /// caller sees it as an I/O error while reading.
    ///
    /// The thread is detached; this function itself only returns `Ok`.
    fn adapt(
        &self,
        a: super::AdaptInfo,
        detection_reason: &crate::matching::SlowMatcher,
    ) -> anyhow::Result<ReadBox> {
        let (r, w) = crate::pipe::pipe();
        // The worker thread needs owned copies of the adapter and the matcher.
        let cc = self.inner.clone();
        let detc = detection_reason.clone();
        std::thread::spawn(move || {
            let mut oup = w;
            if let Err(e) = cc.adapt_write(a, &detc, &mut oup) {
                // `write_err` only fails when the reader has been dropped. In
                // that case nobody is left to receive the error, so ignoring
                // the failure is correct; panicking here would only kill this
                // detached thread with a noisy message.
                let _ = oup.write_err(std::io::Error::new(std::io::ErrorKind::Other, e));
            }
        });
        Ok(Box::new(r))
    }
}

View File

@ -85,7 +85,7 @@ impl FromStr for CacheMaxBlobLen {
/// ///
/// 1. describing the command line arguments using structopt+clap and for man page / readme generation /// 1. describing the command line arguments using structopt+clap and for man page / readme generation
/// 2. describing the config file format (output as JSON schema via schemars) /// 2. describing the config file format (output as JSON schema via schemars)
#[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default)] #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default, Clone)]
#[structopt( #[structopt(
name = "ripgrep-all", name = "ripgrep-all",
rename_all = "kebab-case", rename_all = "kebab-case",

View File

@ -16,7 +16,7 @@ fn main() -> anyhow::Result<()> {
std::env::current_dir()?.join(&filepath) std::env::current_dir()?.join(&filepath)
}; };
let mut i = File::open(&path)?; let i = File::open(&path)?;
let mut o = std::io::stdout(); let mut o = std::io::stdout();
let cache = if args.no_cache { let cache = if args.no_cache {
None None
@ -24,14 +24,14 @@ fn main() -> anyhow::Result<()> {
Some(rga::preproc_cache::open().context("could not open cache")?) Some(rga::preproc_cache::open().context("could not open cache")?)
}; };
let ai = AdaptInfo { let ai = AdaptInfo {
inp: &mut i, inp: Box::new(i),
filepath_hint: &path, filepath_hint: path,
is_real_file: true, is_real_file: true,
oup: &mut o, line_prefix: "".to_string(),
line_prefix: "",
archive_recursion_depth: 0, archive_recursion_depth: 0,
config: PreprocConfig { cache, args: &args }, config: PreprocConfig { cache, args },
}; };
rga_preproc(ai)?; let mut oup = rga_preproc(ai)?;
std::io::copy(&mut oup, &mut o).context("copying adapter output to stdout")?;
Ok(()) Ok(())
} }

View File

@ -1,11 +1,16 @@
#![warn(clippy::all)] #![warn(clippy::all)]
#![feature(negative_impls)]
#![feature(specialization)]
pub mod adapters; pub mod adapters;
pub mod args; pub mod args;
mod caching_writer; mod caching_writer;
pub mod matching; pub mod matching;
pub mod pipe;
pub mod preproc; pub mod preproc;
pub mod preproc_cache; pub mod preproc_cache;
#[cfg(test)]
pub mod test_utils;
use anyhow::Context; use anyhow::Context;
use anyhow::Result; use anyhow::Result;
pub use caching_writer::CachingWriter; pub use caching_writer::CachingWriter;

View File

@ -33,6 +33,12 @@ pub enum SlowMatcher {
MimeType(String), MimeType(String),
} }
impl From<FastMatcher> for SlowMatcher {
fn from(t: FastMatcher) -> Self {
SlowMatcher::Fast(t)
}
}
pub struct FileMeta { pub struct FileMeta {
// filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
// and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed

196
src/pipe.rs Normal file
View File

@ -0,0 +1,196 @@
// https://github.com/arcnmx/pipe-rs/blob/master/src/lib.rs
// extended to support sending io errors
#![deny(missing_docs)]
#![doc(html_root_url = "https://docs.rs/pipe/0.3.0")]
#![cfg_attr(feature = "unstable-doc-cfg", feature(doc_cfg))]
//! Synchronous in-memory pipe
//!
//! ## Example
//!
//! ```
//! use std::thread::spawn;
//! use std::io::{Read, Write};
//!
//! let (mut read, mut write) = ripgrep_all::pipe::pipe();
//!
//! let message = "Hello, world!";
//! spawn(move || write.write_all(message.as_bytes()).unwrap());
//!
//! let mut s = String::new();
//! read.read_to_string(&mut s).unwrap();
//!
//! assert_eq!(&s, message);
//! ```
use crossbeam_channel::{Receiver, Sender};
use std::cmp::min;
use std::io::{self, BufRead, Read, Result, Write};
/// The `Read` end of a pipe (see `pipe()`)
pub struct PipeReader {
    // channel end on which chunks of data (or an I/O error) arrive from the writer
    receiver: Receiver<Result<Vec<u8>>>,
    // the most recently received chunk
    buffer: Vec<u8>,
    // index into `buffer` of the first byte that has not been consumed yet
    position: usize,
}
/// The `Write` end of a pipe (see `pipe()`)
#[derive(Clone)]
pub struct PipeWriter {
    // channel end on which chunks of data (or an I/O error) are sent to the reader
    sender: Sender<Result<Vec<u8>>>,
}
/// Creates a synchronous memory pipe
///
/// Returns the reading end and the writing end, connected by a
/// zero-capacity channel.
pub fn pipe() -> (PipeReader, PipeWriter) {
    let (sender, receiver) = crossbeam_channel::bounded(0);
    let reader = PipeReader {
        receiver,
        buffer: Vec::new(),
        position: 0,
    };
    let writer = PipeWriter { sender };
    (reader, writer)
}
impl PipeWriter {
    /// Extracts the inner `Sender` from the writer
    pub fn into_inner(self) -> Sender<Result<Vec<u8>>> {
        self.sender
    }

    /// Write any error into the pipe, will be handled as an IO error
    ///
    /// The reader returns the error from its next read. Fails with
    /// `BrokenPipe` if the reader has been dropped.
    pub fn write_err(&self, e: std::io::Error) -> Result<()> {
        self.sender
            .send(Err(e))
            .map_err(|_| io::Error::new(io::ErrorKind::BrokenPipe, "pipe reader has been dropped"))
    }
}
impl PipeReader {
    /// Extracts the inner `Receiver` from the reader, and any pending buffered data
    pub fn into_inner(mut self) -> (Receiver<Result<Vec<u8>>>, Vec<u8>) {
        // drop the bytes that were already consumed so only pending data is returned
        self.buffer.drain(..self.position);
        (self.receiver, self.buffer)
    }
}
impl BufRead for PipeReader {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
while self.position >= self.buffer.len() {
match self.receiver.recv() {
// The only existing error is EOF
Err(_) => break,
Ok(Err(e)) => Err(e)?,
Ok(Ok(data)) => {
self.buffer = data;
self.position = 0;
}
}
}
Ok(&self.buffer[self.position..])
}
fn consume(&mut self, amt: usize) {
debug_assert!(self.buffer.len() - self.position >= amt);
self.position += amt
}
}
impl Read for PipeReader {
    /// Copies as many pending bytes as fit into `buf`, blocking for the next
    /// chunk from the writer when nothing is buffered.
    ///
    /// Returns `Ok(0)` for an empty `buf` or at end of stream.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }

        let available = self.fill_buf()?;
        let count = min(buf.len(), available.len());
        if count == 0 {
            return Ok(0);
        }

        buf[..count].copy_from_slice(&available[..count]);
        self.consume(count);
        Ok(count)
    }
}
impl Write for PipeWriter {
    /// Sends a copy of `buf` to the reader as a single chunk.
    ///
    /// Always reports the whole buffer as written on success; fails with
    /// `BrokenPipe` if the reader has been dropped.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let chunk = buf.to_vec();
        match self.sender.send(Ok(chunk)) {
            Ok(()) => Ok(buf.len()),
            Err(_) => Err(io::Error::new(
                io::ErrorKind::BrokenPipe,
                "pipe reader has been dropped",
            )),
        }
    }

    /// Nothing is buffered on the writing side, so flushing is a no-op.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Read, Write};
    use std::thread::spawn;

    /// Data written in two pieces from another thread arrives complete and in order.
    #[test]
    fn pipe_reader() {
        let i = b"hello there";
        let mut o = Vec::with_capacity(i.len());
        let (mut r, mut w) = pipe();
        let guard = spawn(move || {
            w.write_all(&i[..5]).unwrap();
            w.write_all(&i[5..]).unwrap();
            // dropping the writer is what signals end of stream to the reader
            drop(w);
        });

        r.read_to_end(&mut o).unwrap();
        assert_eq!(i, &o[..]);

        guard.join().unwrap();
    }

    /// Writing fails once the reader has been dropped.
    #[test]
    fn pipe_writer_fail() {
        let i = b"hi";
        let (r, mut w) = pipe();
        let guard = spawn(move || {
            drop(r);
        });

        assert!(w.write_all(i).is_err());

        guard.join().unwrap();
    }

    /// Reading with a buffer smaller than the written chunks still yields every byte.
    #[test]
    fn small_reads() {
        let block_cnt = 20;
        const BLOCK: usize = 20;
        let (mut r, mut w) = pipe();
        let guard = spawn(move || {
            for _ in 0..block_cnt {
                let data = &[0; BLOCK];
                w.write_all(data).unwrap();
            }
        });

        // half a block per read, so every chunk needs two reads
        let mut buff = [0; BLOCK / 2];
        let mut read = 0;
        while let Ok(size) = r.read(&mut buff) {
            // 0 means EOF
            if size == 0 {
                break;
            }
            read += size;
        }
        assert_eq!(block_cnt * BLOCK, read);

        guard.join().unwrap();
    }
}

View File

@ -4,20 +4,20 @@ use crate::matching::*;
use crate::{print_bytes, print_dur, CachingWriter}; use crate::{print_bytes, print_dur, CachingWriter};
use anyhow::*; use anyhow::*;
use log::*; use log::*;
use path_clean::PathClean;
use std::convert::TryInto;
use std::io::BufRead;
use std::io::BufReader; use std::io::BufReader;
use std::io::BufWriter;
use std::{ use std::{
sync::{Arc, RwLock}, sync::{Arc, RwLock},
time::Instant, time::Instant,
}; };
#[derive(Clone)] #[derive(Clone)]
pub struct PreprocConfig<'a> { pub struct PreprocConfig {
pub cache: Option<Arc<RwLock<dyn crate::preproc_cache::PreprocCache>>>, pub cache: Option<Arc<RwLock<dyn crate::preproc_cache::PreprocCache>>>,
pub args: &'a RgaConfig, pub args: RgaConfig,
} }
/** /**
* preprocess a file as defined in `ai`. * preprocess a file as defined in `ai`.
@ -25,19 +25,18 @@ pub struct PreprocConfig<'a> {
* If a cache is passed, read/write to it. * If a cache is passed, read/write to it.
* *
*/ */
pub fn rga_preproc(ai: AdaptInfo) -> Result<()> { pub fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
is_real_file, is_real_file,
inp, inp,
oup,
line_prefix, line_prefix,
config, config,
archive_recursion_depth, archive_recursion_depth,
.. ..
} = ai; } = ai;
debug!("path (hint) to preprocess: {:?}", filepath_hint); debug!("path (hint) to preprocess: {:?}", filepath_hint);
let PreprocConfig { mut cache, args } = config; let PreprocConfig { cache: _, args } = config;
let filtered_adapters = get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?; let filtered_adapters = get_adapters_filtered(args.custom_adapters.clone(), &args.adapters)?;
let adapters = adapter_matcher(&filtered_adapters, args.accurate)?; let adapters = adapter_matcher(&filtered_adapters, args.accurate)?;
let filename = filepath_hint let filename = filepath_hint
@ -45,22 +44,22 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> {
.ok_or_else(|| format_err!("Empty filename"))?; .ok_or_else(|| format_err!("Empty filename"))?;
debug!("Archive recursion depth: {}", archive_recursion_depth); debug!("Archive recursion depth: {}", archive_recursion_depth);
if archive_recursion_depth >= args.max_archive_recursion.0 { if archive_recursion_depth >= args.max_archive_recursion.0 {
writeln!(oup, "{}[rga: max archive recursion reached]", line_prefix)?; let s = format!("{}[rga: max archive recursion reached]", line_prefix).into_bytes();
return Ok(()); return Ok(Box::new(std::io::Cursor::new(s)));
} }
// todo: figure out when using a bufreader is a good idea and when it is not // todo: figure out when using a bufreader is a good idea and when it is not
// seems to be good for File::open() reads, but not sure about within archives (tar, zip) // seems to be good for File::open() reads, but not sure about within archives (tar, zip)
let inp = &mut BufReader::with_capacity(1 << 13, inp); let inp = BufReader::with_capacity(1 << 16, inp);
let mimetype = if args.accurate { let mimetype = None; /*if args.accurate {
let buf = inp.fill_buf()?; // fill but do not consume! let buf = inp.fill_buf()?; // fill but do not consume!
let mimetype = tree_magic::from_u8(buf); let mimetype = tree_magic::from_u8(buf);
debug!("mimetype: {:?}", mimetype); debug!("mimetype: {:?}", mimetype);
Some(mimetype) Some(mimetype)
} else { } else {
None None
}; };*/
let adapter = adapters(FileMeta { let adapter = adapters(FileMeta {
mimetype, mimetype,
lossy_filename: filename.to_string_lossy().to_string(), lossy_filename: filename.to_string_lossy().to_string(),
@ -77,8 +76,8 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> {
filepath_hint.to_string_lossy(), filepath_hint.to_string_lossy(),
&meta.name &meta.name
); );
let db_name = format!("{}.v{}", meta.name, meta.version); let _db_name = format!("{}.v{}", meta.name, meta.version);
if let Some(cache) = cache.as_mut() { /*if let Some(cache) = cache.as_mut() {
let cache_key: Vec<u8> = { let cache_key: Vec<u8> = {
let clean_path = filepath_hint.to_owned().clean(); let clean_path = filepath_hint.to_owned().clean();
let meta = std::fs::metadata(&filepath_hint)?; let meta = std::fs::metadata(&filepath_hint)?;
@ -160,18 +159,17 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> {
}), }),
)?; )?;
Ok(()) Ok(())
} else { } else { */
// no cache arc - probably within archive // no cache arc - probably within archive
debug!("adapting without caching..."); debug!("adapting without caching...");
let start = Instant::now(); let start = Instant::now();
adapter let oread = adapter
.adapt( .adapt(
AdaptInfo { AdaptInfo {
line_prefix, line_prefix,
filepath_hint, filepath_hint: filepath_hint.clone(),
is_real_file, is_real_file,
inp, inp: Box::new(inp),
oup,
archive_recursion_depth, archive_recursion_depth,
config: PreprocConfig { cache: None, args }, config: PreprocConfig { cache: None, args },
}, },
@ -189,16 +187,15 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<()> {
adapter.metadata().name, adapter.metadata().name,
print_dur(start) print_dur(start)
); );
Ok(()) Ok(oread)
} /* }*/
} }
None => { None => {
// allow passthrough if the file is in an archive or accurate matching is enabled // allow passthrough if the file is in an archive or accurate matching is enabled
// otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us // otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us
let allow_cat = !is_real_file || args.accurate; let allow_cat = !is_real_file || args.accurate;
if allow_cat { if allow_cat {
spawning::postproc_line_prefix(line_prefix, inp, oup)?; Ok(Box::new(inp))
Ok(())
} else { } else {
Err(format_err!( Err(format_err!(
"No adapter found for file {:?}, passthrough disabled.", "No adapter found for file {:?}, passthrough disabled.",

View File

@ -10,7 +10,7 @@ use std::{
pub fn open() -> Result<Arc<RwLock<dyn PreprocCache>>> { pub fn open() -> Result<Arc<RwLock<dyn PreprocCache>>> {
Ok(Arc::new(RwLock::new(LmdbCache::open()?))) Ok(Arc::new(RwLock::new(LmdbCache::open()?)))
} }
pub trait PreprocCache { pub trait PreprocCache: Send + Sync {
// possible without second lambda? // possible without second lambda?
fn get_or_run<'a>( fn get_or_run<'a>(
&mut self, &mut self,

33
src/test_utils.rs Normal file
View File

@ -0,0 +1,33 @@
use crate::{
adapters::{AdaptInfo, ReadBox},
args::RgaConfig,
matching::{FastMatcher, SlowMatcher},
preproc::PreprocConfig,
};
use std::{
path::{Path, PathBuf},
};
pub fn test_data_dir() -> PathBuf {
let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
d.push("exampledir/test/");
d
}
pub fn simple_adapt_info(filepath: &Path, inp: ReadBox) -> (AdaptInfo, SlowMatcher) {
(
AdaptInfo {
filepath_hint: filepath.to_owned(),
is_real_file: true,
archive_recursion_depth: 0,
inp,
line_prefix: "PREFIX:".to_string(),
config: PreprocConfig {
cache: None,
args: RgaConfig::default(),
},
},
FastMatcher::FileExtension(filepath.extension().unwrap().to_string_lossy().into_owned())
.into(),
)
}