prefixing postprocessor

This commit is contained in:
phiresky 2020-09-30 16:22:54 +02:00
parent cc744176ca
commit ab5ddcad2e
8 changed files with 236 additions and 144 deletions

View File

@ -1,7 +1,7 @@
//pub mod custom;
// pub mod custom;
// pub mod decompress;
// pub mod ffmpeg;
pub mod fns;
pub mod postproc;
// pub mod pdfpages;
// pub mod spawning;
// pub mod sqlite;
@ -19,11 +19,11 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::io::prelude::*;
use std::iter::Iterator;
use std::path::{Path, PathBuf};
use std::path::PathBuf;
use std::rc::Rc;
pub type ReadBox<'a> = Box<dyn Read + 'a>;
pub type ReadIterBox<'a> = Box<dyn ReadIter + 'a>;
pub struct AdapterMeta {
/// unique short name of this adapter (a-z0-9 only)
pub name: String,
@ -92,6 +92,20 @@ pub trait ReadIter {
fn next<'a>(&'a mut self) -> Option<AdaptInfo<'a>>;
}
/// A [`ReadIter`] that yields exactly one [`AdaptInfo`] and is then exhausted.
///
/// Handy for adapters that turn one input into a single output stream.
pub struct SingleReadIter<'a> {
    /// The item still to be handed out; `None` once `next` has returned it.
    ai: Option<AdaptInfo<'a>>,
}
impl<'a> SingleReadIter<'a> {
    /// Wraps `ai` so that it is returned by the first call to `next`.
    pub fn new(ai: AdaptInfo<'a>) -> Self {
        SingleReadIter { ai: Some(ai) }
    }
}
impl ReadIter for SingleReadIter<'_> {
    /// Returns the wrapped item on the first call and `None` on every later call.
    fn next<'a>(&'a mut self) -> Option<AdaptInfo<'a>> {
        self.ai.take()
    }
}
pub struct AdaptInfo<'a> {
/// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions.
pub filepath_hint: PathBuf,
@ -103,6 +117,7 @@ pub struct AdaptInfo<'a> {
pub inp: ReadBox<'a>,
/// prefix every output line with this string to better indicate the file's location if it is in some archive
pub line_prefix: String,
pub postprocess: bool,
pub config: RgaConfig,
}

View File

@ -3,11 +3,10 @@
//impl<T> FileAdapter for T where T: RunFnAdapter {}
use anyhow::Result;
use std::io::{BufRead, BufReader};
use std::{
cmp::min,
io::{Read, Write},
};
use std::{cmp::min, io::Read};
use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata, SingleReadIter};
struct ByteReplacer<R>
where
@ -75,18 +74,64 @@ where
}
}
pub fn postprocB(_line_prefix: &str, inp: impl Read) -> Result<impl Read> {
/// Postprocessing adapter that adds the line prefix to each line of its input.
pub struct PostprocPrefix {}
impl GetMetadata for PostprocPrefix {
    /// Returns the adapter's metadata.
    ///
    /// The value lives in a `lazy_static`, so it is constructed once on first
    /// access and shared by all instances afterwards.
    fn metadata(&self) -> &super::AdapterMeta {
        lazy_static::lazy_static! {
            static ref METADATA: AdapterMeta = AdapterMeta {
                name: String::from("postprocprefix"),
                version: 1,
                description: String::from("Adds the line prefix to each line"),
                recurses: true,
                // no matchers: this adapter is not selected by file type
                fast_matchers: Vec::new(),
                slow_matchers: None,
                keep_fast_matchers_if_accurate: false,
                disabled_by_default: false,
            };
        }
        &METADATA
    }
}
impl FileAdapter for PostprocPrefix {
    /// Wraps the input stream with [`postproc_prefix`], using `a.line_prefix` as the prefix.
    ///
    /// The returned iterator yields a single `AdaptInfo` that is identical to `a`
    /// (file name hint etc.) except that `inp` is the prefixed reader and
    /// `postprocess` is `false`.
    /// NOTE(review): `postprocess` is presumably cleared so the output is not
    /// prefixed a second time when it is processed again -- confirm against the caller.
    ///
    /// `_detection_reason` is ignored.
    fn adapt<'a>(
        &self,
        a: super::AdaptInfo<'a>,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<Box<dyn super::ReadIter + 'a>> {
        let prefixed = postproc_prefix(&a.line_prefix, a.inp)?;
        // everything but the two overridden fields is carried over from `a`
        let single = SingleReadIter::new(AdaptInfo {
            inp: Box::new(prefixed),
            postprocess: false,
            ..a
        });
        Ok(Box::new(single))
    }
}
/// Builds a reader over `inp` that is configured to put `line_prefix` in front of every line.
///
/// The work is delegated to `ByteReplacer` (defined elsewhere in this file), set up with:
/// - `next_read`: the bare prefix bytes, queued before any input is read,
/// - `haystacker`: locates the next `\n` in a buffer,
/// - `replacer`: substitutes each located byte with `\n` followed by the prefix.
pub fn postproc_prefix(line_prefix: &str, inp: impl Read) -> Result<impl Read> {
    // Owned copies are required because the returned reader outlives the borrowed `line_prefix`.
    let start_of_stream = line_prefix.as_bytes().to_vec();
    let after_newline = format!("\n{}", line_prefix).into_bytes();
    Ok(ByteReplacer {
        inner: inp,
        next_read: start_of_stream,
        haystacker: Box::new(|buf| memchr::memchr(b'\n', buf)),
        replacer: Box::new(move |_| after_newline.clone()),
    })
}
pub fn postproc_pagebreaks(line_prefix: &str, inp: impl Read) -> Result<impl Read> {
let line_prefix = line_prefix.to_string(); // clone since
let mut page_count = 1;
Ok(ByteReplacer {
inner: inp,
next_read: Vec::new(),
next_read: format!("{}Page {}:", line_prefix, page_count).into_bytes(),
haystacker: Box::new(|buf| memchr::memchr2(b'\n', b'\x0c', buf)),
replacer: Box::new(move |b| match b {
b'\n' => format!("\nPage {}:", page_count).into_bytes(),
b'\n' => format!("\n{}Page {}:", line_prefix, page_count).into_bytes(),
b'\x0c' => {
page_count += 1;
format!("\nPage {}:", page_count).into_bytes()
format!("\n{}Page {}:", line_prefix, page_count).into_bytes()
}
_ => b"[[imposs]]".to_vec(),
}),
@ -95,13 +140,13 @@ pub fn postprocB(_line_prefix: &str, inp: impl Read) -> Result<impl Read> {
#[cfg(test)]
mod tests {
use super::postprocB;
use super::postproc_pagebreaks;
use anyhow::Result;
use std::io::Read;
fn test_from_strs(a: &str, b: &str) -> Result<()> {
let mut oup = Vec::new();
postprocB("", a.as_bytes())?.read_to_end(&mut oup)?;
postproc_pagebreaks("", a.as_bytes())?.read_to_end(&mut oup)?;
let c = String::from_utf8_lossy(&oup);
if b != c {
anyhow::bail!("{}\nshould be\n{}\nbut is\n{}", a, b, c);
@ -113,14 +158,14 @@ mod tests {
#[test]
fn post1() -> Result<()> {
let inp = "What is this\nThis is a test\nFoo";
let oup = "What is this\nPage 1:This is a test\nPage 1:Foo";
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo";
test_from_strs(inp, oup)?;
println!("\n\n\n\n");
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
let oup = "What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
test_from_strs(inp, oup)?;

View File

@ -1,13 +1,9 @@
use super::*;
use crate::{preproc::rga_preproc, print_bytes};
use ::zip::read::ZipFile;
use crate::print_bytes;
use anyhow::*;
use lazy_static::lazy_static;
use log::*;
// todo:
// maybe todo: read list of extensions from
//ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
static EXTENSIONS: &[&str] = &["zip"];
lazy_static! {
@ -39,27 +35,19 @@ impl GetMetadata for ZipAdapter {
}
}
// https://github.com/mvdnes/zip-rs/commit/b9af51e654793931af39f221f143b9dea524f349
fn is_dir(f: &ZipFile) -> bool {
f.name()
.chars()
.rev()
.next()
.map_or(false, |c| c == '/' || c == '\\')
}
struct OutIter<'a> {
struct ZipAdaptIter<'a> {
inp: AdaptInfo<'a>,
}
impl<'a> ReadIter for OutIter<'a> {
impl<'a> ReadIter for ZipAdaptIter<'a> {
fn next<'b>(&'b mut self) -> Option<AdaptInfo<'b>> {
let line_prefix = "todo";
let filepath_hint = std::path::PathBuf::from("hello");
let line_prefix = &self.inp.line_prefix;
let filepath_hint = &self.inp.filepath_hint;
let archive_recursion_depth = 1;
let postprocess = self.inp.postprocess;
::zip::read::read_zipfile_from_stream(&mut self.inp.inp)
.unwrap()
.and_then(|file| {
if is_dir(&file) {
if file.is_dir() {
return None;
}
debug!(
@ -72,11 +60,12 @@ impl<'a> ReadIter for OutIter<'a> {
);
let line_prefix = format!("{}{}: ", line_prefix, file.name());
Some(AdaptInfo {
filepath_hint: file.sanitized_name().clone(),
filepath_hint: PathBuf::from(file.name()),
is_real_file: false,
inp: Box::new(file),
line_prefix,
archive_recursion_depth: archive_recursion_depth + 1,
postprocess,
config: RgaConfig::default(), //config.clone(),
})
})
@ -86,41 +75,56 @@ impl<'a> ReadIter for OutIter<'a> {
impl FileAdapter for ZipAdapter {
fn adapt<'a>(
&self,
ai: AdaptInfo<'a>,
detection_reason: &FileMatcher,
inp: AdaptInfo<'a>,
_detection_reason: &FileMatcher,
) -> Result<Box<dyn ReadIter + 'a>> {
Ok(Box::new(OutIter { inp: ai }))
/*loop {
match ::zip::read::read_zipfile_from_stream(&mut inp) {
Ok(None) => break,
Ok(Some(mut file)) => {
if is_dir(&file) {
continue;
}
debug!(
"{}{}|{}: {} ({} packed)",
line_prefix,
filepath_hint.to_string_lossy(),
file.name(),
print_bytes(file.size() as f64),
print_bytes(file.compressed_size() as f64)
);
let line_prefix = format!("{}{}: ", line_prefix, file.name());
let mut rd = rga_preproc(AdaptInfo {
filepath_hint: file.sanitized_name().clone(),
is_real_file: false,
inp: &mut file,
line_prefix,
archive_recursion_depth: archive_recursion_depth + 1,
config: config.clone(),
})?;
// copy read stream from inner file to output
std::io::copy(&mut rd, oup);
drop(rd);
}
Err(e) => return Err(e.into()),
}
}
Ok(())*/
Ok(Box::new(ZipAdaptIter { inp }))
}
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::{recurse::RecursingConcattyReader, test_utils::*};

    /// Builds an in-memory zip archive containing one stored (uncompressed) file
    /// `fname` with the given `content`.
    ///
    /// If `add_inner` is set, a second entry `inner.zip` is appended which is itself
    /// a zip archive holding `inner.txt`.
    fn create_zip(fname: &str, content: &str, add_inner: bool) -> Result<Vec<u8>> {
        use ::zip::write::FileOptions;
        use std::io::Write;

        // We use a buffer here, though you'd normally use a `File`
        let mut zip = ::zip::ZipWriter::new(std::io::Cursor::new(Vec::new()));
        let options = FileOptions::default().compression_method(::zip::CompressionMethod::Stored);
        zip.start_file(fname, options)?;
        // `write_all` instead of `write`: a plain `write` may stop after a partial
        // write and its returned byte count was being discarded.
        zip.write_all(content.as_bytes())?;

        if add_inner {
            zip.start_file("inner.zip", options)?;
            zip.write_all(&create_zip("inner.txt", "inner text file", false)?)?;
        }
        // Apply the changes you've made.
        // Dropping the `ZipWriter` will have the same effect, but may silently fail
        Ok(zip.finish()?.into_inner())
    }

    /// Runs the zip adapter over a generated archive and compares the
    /// concatenated output of all inner files with the expected text.
    #[test]
    fn recurse() -> Result<()> {
        let zipfile = create_zip("outer.txt", "outer text file", true)?;
        let adapter: Box<dyn FileAdapter> = Box::new(ZipAdapter::new());

        let (a, d) = simple_adapt_info(
            &PathBuf::from("outer.zip"),
            Box::new(std::io::Cursor::new(zipfile)),
        );
        let mut res = RecursingConcattyReader::concat(adapter.adapt(a, &d)?)?;

        let mut buf = Vec::new();
        res.read_to_end(&mut buf)?;

        assert_eq!(
            String::from_utf8(buf)?,
            "PREFIX:outer.txt:outer text file\n",
        );

        Ok(())
    }
}

View File

@ -26,6 +26,7 @@ fn main() -> anyhow::Result<()> {
is_real_file: true,
line_prefix: "".to_string(),
archive_recursion_depth: 0,
postprocess: true,
config,
};

View File

@ -9,6 +9,7 @@ pub mod matching;
pub mod pipe;
pub mod preproc;
pub mod preproc_cache;
pub mod recurse;
#[cfg(test)]
pub mod test_utils;
use anyhow::Context;

View File

@ -1,14 +1,14 @@
use crate::adapters::*;
use crate::matching::*;
use crate::{matching::*, recurse::RecursingConcattyReader};
use crate::{
preproc_cache::{LmdbCache, PreprocCache},
print_bytes, print_dur, CachingReader,
};
use anyhow::*;
use log::*;
use owning_ref::OwningRefMut;
use path_clean::PathClean;
use std::{convert::TryInto, io::Read};
use postproc::PostprocPrefix;
use std::convert::TryInto;
use std::io::{BufRead, BufReader};
@ -27,7 +27,7 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
line_prefix,
config,
archive_recursion_depth,
..
postprocess,
} = ai;
debug!("path (hint) to preprocess: {:?}", filepath_hint);
let filtered_adapters =
@ -58,76 +58,45 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
mimetype,
lossy_filename: filename.to_string_lossy().to_string(),
});
match adapter {
Some((adapter, detection_reason)) => run_adapter(
AdaptInfo {
filepath_hint,
is_real_file,
inp: Box::new(inp),
line_prefix,
config,
archive_recursion_depth,
},
adapter,
detection_reason,
&filtered_adapters,
),
let (adapter, detection_reason) = match adapter {
Some((a, d)) => (a, d),
None => {
// allow passthrough if the file is in an archive or accurate matching is enabled
// otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us
let allow_cat = !is_real_file || config.accurate;
if allow_cat {
Ok(Box::new(inp))
if postprocess {
(
Rc::new(PostprocPrefix {}) as Rc<dyn FileAdapter>,
FileMatcher::Fast(FastFileMatcher::FileExtension("default".to_string())), // todo: separate enum value for this
)
} else {
return Ok(Box::new(inp));
}
} else {
Err(format_err!(
return Err(format_err!(
"No adapter found for file {:?}, passthrough disabled.",
filename
))
));
}
}
}
}
struct ConcattyReader<'a> {
inp: Box<dyn ReadIter + 'a>,
cur: Option<AdaptInfo<'a>>,
}
impl<'a> ConcattyReader<'a> {
fn ascend(&mut self) {
self.cur = unsafe {
// would love to make this safe, but how?
let r: *mut Box<dyn ReadIter + 'a> = &mut self.inp;
(*r).next()
};
eprintln!(
"ascended to {}",
self.cur
.as_ref()
.map(|e| e.filepath_hint.to_string_lossy().into_owned())
.unwrap_or("END".to_string())
);
}
}
impl<'a> Read for ConcattyReader<'a> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
match &mut self.cur {
None => Ok(0), // last file ended
Some(cur) => match cur.inp.read(buf) {
Err(e) => Err(e),
Ok(0) => {
// current file ended, go to next file
self.ascend();
self.read(buf)
}
Ok(n) => Ok(n),
},
}
}
}
fn concattyreader<'a>(inp: Box<dyn ReadIter + 'a>) -> Box<dyn Read + 'a> {
let mut r = ConcattyReader { inp, cur: None };
r.ascend();
Box::new(r)
};
let path_hint_copy = filepath_hint.clone();
run_adapter(
AdaptInfo {
filepath_hint,
is_real_file,
inp: Box::new(inp),
line_prefix,
config,
archive_recursion_depth,
postprocess,
},
adapter,
detection_reason,
&filtered_adapters,
)
.with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
}
fn run_adapter<'a>(
@ -143,7 +112,7 @@ fn run_adapter<'a>(
line_prefix,
config,
archive_recursion_depth,
..
postprocess,
} = ai;
let meta = adapter.metadata();
debug!(
@ -159,10 +128,18 @@ fn run_adapter<'a>(
let cache_compression_level = config.cache.compression_level;
let cache_max_blob_len = config.cache.max_blob_len;
if let Some(mut cache) = LmdbCache::open(&config.cache)? {
let cache = if is_real_file {
LmdbCache::open(&config.cache)?
} else {
None
};
if let Some(mut cache) = cache {
let cache_key: Vec<u8> = {
let clean_path = filepath_hint.to_owned().clean();
let meta = std::fs::metadata(&filepath_hint)?;
let meta = std::fs::metadata(&filepath_hint).with_context(|| {
format!("reading metadata for {}", filepath_hint.to_string_lossy())
})?;
let modified = meta.modified().expect("weird OS that can't into mtime");
if adapter.metadata().recurses {
@ -206,6 +183,7 @@ fn run_adapter<'a>(
inp: Box::new(inp),
archive_recursion_depth,
config,
postprocess,
},
&detection_reason,
)
@ -216,8 +194,7 @@ fn run_adapter<'a>(
meta.name
)
})?;
while let Some(innerinp) = inp.next() {}
/*let inp = concattyreader(inp);
let inp = RecursingConcattyReader::concat(inp)?;
let inp = CachingReader::new(
inp,
cache_max_blob_len.0.try_into().unwrap(),
@ -233,7 +210,7 @@ fn run_adapter<'a>(
}
Ok(())
}),
)?;*/
)?;
Ok(Box::new(inp))
}
@ -251,6 +228,7 @@ fn run_adapter<'a>(
inp,
archive_recursion_depth,
config,
postprocess,
},
&detection_reason,
)
@ -266,6 +244,6 @@ fn run_adapter<'a>(
adapter.metadata().name,
print_dur(start)
);
Ok(concattyreader(oread))
Ok(RecursingConcattyReader::concat(oread)?)
}
}

48
src/recurse.rs Normal file
View File

@ -0,0 +1,48 @@
use crate::adapters::*;
use crate::preproc::rga_preproc;
use anyhow::*;
use std::io::Read;
/// A [`Read`] implementation that concatenates the preprocessed contents of every
/// inner file yielded by a [`ReadIter`].
///
/// Each `AdaptInfo` produced by the iterator is passed through [`rga_preproc`];
/// the resulting readers are consumed one after another.
pub struct RecursingConcattyReader<'a> {
    // NOTE: field order matters. Struct fields are dropped in declaration order, and
    // `cur` may hold a reader that was handed out by `inp` (its lifetime is detached
    // from `inp` in `ascend`). `cur` therefore has to be declared -- and dropped --
    // before `inp`.
    /// Reader for the inner file currently being consumed; `None` once the iterator is exhausted.
    cur: Option<ReadBox<'a>>,
    /// Source of the inner files.
    inp: Box<dyn ReadIter + 'a>,
}
impl<'a> RecursingConcattyReader<'a> {
    /// Creates a reader over all inner files of `inp`, already positioned on the first one.
    ///
    /// # Errors
    /// Fails if preprocessing the first inner file fails.
    pub fn concat(inp: Box<dyn ReadIter + 'a>) -> Result<Box<dyn Read + 'a>> {
        let mut r = RecursingConcattyReader { cur: None, inp };
        r.ascend()?;
        Ok(Box::new(r))
    }
    /// Drops the current inner reader and moves on to the next inner file, if any.
    ///
    /// # Errors
    /// Fails if [`rga_preproc`] fails for the next inner file. In that case no
    /// current reader is set, so subsequent reads report end-of-stream.
    pub fn ascend(&mut self) -> Result<()> {
        // Release the previous inner reader *before* touching the iterator again:
        // it may still reference state owned by `inp`.
        self.cur = None;
        let inp: *mut Box<dyn ReadIter + 'a> = &mut self.inp;
        // get next inner file from inp
        // SAFETY: `inp` points at `self.inp`, which is alive and exclusively borrowed
        // through `&mut self`. Going through the raw pointer only detaches the lifetime
        // of the returned `AdaptInfo` from this borrow so it can be stored next to the
        // iterator. This relies on (1) the previous item having been dropped above,
        // (2) `inp` only being accessed again after `cur` is dropped or replaced, and
        // (3) `cur` being dropped before `inp` (see the field order of the struct).
        // would love to make this safe, but how? something like OwnedRef<inp, cur>
        let ai = unsafe { (*inp).next() };
        self.cur = match ai {
            Some(ai) => Some(rga_preproc(ai)?),
            None => None,
        };
        Ok(())
    }
}
impl<'a> Read for RecursingConcattyReader<'a> {
    /// Reads from the current inner file, transparently advancing to the next one
    /// whenever the current one is exhausted. Returns `Ok(0)` after the last file.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        // With an empty buffer every inner reader would answer `Ok(0)`, which must
        // not be mistaken for "file ended" -- otherwise all remaining files get skipped.
        if buf.is_empty() {
            return Ok(0);
        }
        // Loop instead of recursing so that a long run of empty inner files
        // cannot grow the stack.
        loop {
            let cur = match &mut self.cur {
                None => return Ok(0), // last file ended
                Some(cur) => cur,
            };
            match cur.read(buf)? {
                // current file ended, go to next file
                0 => self
                    .ascend()
                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?,
                n => return Ok(n),
            }
        }
    }
}

View File

@ -11,7 +11,7 @@ pub fn test_data_dir() -> PathBuf {
d
}
pub fn simple_adapt_info(filepath: &Path, inp: ReadBox) -> (AdaptInfo, FileMatcher) {
pub fn simple_adapt_info<'a>(filepath: &Path, inp: ReadBox<'a>) -> (AdaptInfo<'a>, FileMatcher) {
(
AdaptInfo {
filepath_hint: filepath.to_owned(),