finally fix tar

This commit is contained in:
phiresky 2019-06-06 23:19:59 +02:00
parent 83114f66bf
commit 83b804bef2
8 changed files with 97 additions and 33 deletions

24
Cargo.lock generated
View File

@ -131,6 +131,16 @@ name = "cfg-if"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chrono"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "clap"
version = "2.33.0"
@ -483,6 +493,15 @@ name = "nom"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num-integer"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-traits"
version = "0.2.8"
@ -752,11 +771,12 @@ dependencies = [
[[package]]
name = "rga"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1127,6 +1147,7 @@ dependencies = [
"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8"
"checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d"
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
@ -1169,6 +1190,7 @@ dependencies = [
"checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab"
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff"
"checksum num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)" = "b85e541ef8255f6cf42bbfe4ef361305c6c135d10919ecc26126c4e5ae94bc09"
"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32"
"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba"
"checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"

View File

@ -5,7 +5,7 @@ cargo-features = ["default-run"]
name = "rga"
description = "ripgrep, except for pdf, ebooks, Office documents, etc"
license = "AGPL-3.0-or-later"
version = "0.1.0"
version = "0.2.0"
repository = "https://github.com/phiresky/rga"
authors = ["phiresky <phireskyde+git@gmail.com>"]
edition = "2018"
@ -37,3 +37,4 @@ xz2 = "0.1.6"
flate2 = "1.0.7"
bzip2 = "0.3.3"
tar = "0.4.26"
chrono = "0.4.6"

View File

@ -3,6 +3,10 @@ similar:
- pdfgrep
- https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa
# todo
- jpg adapter (based on object classification / detection (yolo?)) for fun
# considerations
- matching on mime (magic bytes) instead of filename

BIN
exampledir/test.tar Normal file

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,7 @@ use ::tar::EntryType::Regular;
use failure::*;
use lazy_static::lazy_static;
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;
static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"];
@ -31,30 +32,59 @@ impl GetMetadata for TarAdapter {
&METADATA
}
}
/*struct WrapRead<'a> {
inner: &mut 'a Read;
}
impl Read for WrapRead {
r
}*/
/*fn decompress_any(filename: &Path, inp: &mut Read) -> Fallible<Box<Read>> {
// make a &mut Read into an owned Read because the streaming decompressors want to take ownership of their base Reads
/// Owned `Read` value that stands in for a mutably borrowed reader.
///
/// It keeps the borrow for `'a` and forwards every `read` call to it
/// unchanged, so the borrowed reader can be passed by value to code that
/// insists on owning its input.
struct WrapRead<'a> {
    inner: &'a mut dyn Read,
}

impl<'a> Read for WrapRead<'a> {
    /// Delegates to the wrapped reader and hands back its result as-is.
    fn read(&mut self, dst: &mut [u8]) -> std::io::Result<usize> {
        Read::read(&mut *self.inner, dst)
    }
}
// feeling a little stupid here. why is this needed at all
/// One concrete reader type covering every input kind `decompress_any` can
/// produce.
///
/// Each variant owns a reader layered on top of `R`; `Passthrough` holds `R`
/// itself. The zstd variant wraps `R` in a `BufReader` first, as its decoder
/// type is parameterised over a buffered reader here.
enum SpecRead<R: Read> {
    Gz(flate2::read::MultiGzDecoder<R>),
    Bz2(bzip2::read::BzDecoder<R>),
    Xz(xz2::read::XzDecoder<R>),
    Zst(zstd::stream::read::Decoder<BufReader<R>>),
    Passthrough(R),
}

impl<R: Read> Read for SpecRead<R> {
    /// Forwards the read to whichever reader this value currently holds.
    fn read(&mut self, dst: &mut [u8]) -> std::io::Result<usize> {
        match self {
            SpecRead::Gz(reader) => reader.read(dst),
            SpecRead::Bz2(reader) => reader.read(dst),
            SpecRead::Xz(reader) => reader.read(dst),
            SpecRead::Zst(reader) => reader.read(dst),
            SpecRead::Passthrough(reader) => reader.read(dst),
        }
    }
}
// why do I need to wrap the output here in a specific type? is it possible with just a Box<Read> for every type?
/// Picks a reader for `inp` based on the extension of `filename`.
///
/// `inp` is first wrapped in `WrapRead` so it can be handed over by value.
/// `Path::extension` only yields the part after the final dot, so a name
/// ending in `.tar.gz` is matched as `"gz"`.
///
/// Returns an error if the extension is not one of the names matched below,
/// if `filename` has no extension, or if constructing the zstd decoder fails
/// (propagated via `?`).
///
/// NOTE(review): two sets of match arms appear below (`Box::new(..)` and
/// `SpecRead::..`), plus a stray `}*/` near the end. This looks like the
/// before/after lines of a diff rendered together — confirm which set is
/// current.
fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible<SpecRead<WrapRead<'a>>>
where
R: Read,
{
// Shadow the borrowed reader with an owned wrapper around it.
let inp = WrapRead { inner: inp };
let extension = filename.extension().map(|e| e.to_string_lossy().to_owned());
match extension {
Some(e) => Ok(match e.to_owned().as_ref() {
"gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)),
"bz2" => Box::new(bzip2::read::BzDecoder::new(inp)),
"xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)),
"zst" => Box::new(zstd::stream::read::Decoder::new(inp)?),
e => Err(format_err!("don't know how to decompress {}", e))?,
"gz" => SpecRead::Gz(flate2::read::MultiGzDecoder::new(inp)),
"bz2" => SpecRead::Bz2(bzip2::read::BzDecoder::new(inp)),
"xz" => SpecRead::Xz(xz2::read::XzDecoder::new_multi_decoder(inp)),
"zst" => SpecRead::Zst(zstd::stream::read::Decoder::new(inp)?),
// A plain `.tar` needs no decompression; the input is passed through.
"tar" => SpecRead::Passthrough(inp),
ext => Err(format_err!("don't know how to decompress {}", ext))?,
}),
None => Err(format_err!("no extension")),
}
}*/
}
impl FileAdapter for TarAdapter {
fn adapt<'a>(&self, ai: AdaptInfo) -> Fallible<()> {
use std::io::prelude::*;
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
mut inp,
@ -62,7 +92,8 @@ impl FileAdapter for TarAdapter {
line_prefix,
..
} = ai;
let decompress = inp; //decompress_any(filepath_hint, &inp)?;
let decompress = decompress_any(filepath_hint, &mut inp)?;
let mut archive = ::tar::Archive::new(decompress);
for entry in archive.entries()? {
let mut file = entry.unwrap();
@ -75,15 +106,13 @@ impl FileAdapter for TarAdapter {
);
if Regular == file.header().entry_type() {
let line_prefix = &format!("{}{}: ", line_prefix, path.display());
rga_preproc(
AdaptInfo {
filepath_hint: &path,
inp: &mut file,
oup: oup,
line_prefix,
},
None,
)?;
let ai2: AdaptInfo = AdaptInfo {
filepath_hint: &path,
inp: &mut file,
oup: oup,
line_prefix,
};
rga_preproc(ai2, None)?;
}
}
Ok(())

View File

@ -24,9 +24,8 @@ fn main() -> Result<(), Error> {
};
let cache_db = match env::var("RGA_NO_CACHE") {
Ok(ref s) if s.len() > 0 => Some(open_cache_db()?),
Ok(_) => None,
Err(_) => None,
Ok(ref s) if s.len() > 0 => None,
Ok(_) | Err(_) => Some(open_cache_db()?),
};
rga_preproc(ai, cache_db)

View File

@ -2,7 +2,9 @@ use crate::adapters::*;
use crate::CachingWriter;
use failure::{format_err, Error};
use path_clean::PathClean;
use std::fs::File;
use std::io::Read;
use std::io::Write;
use std::path::Path;
use std::path::PathBuf;
use std::rc::Rc;
@ -21,16 +23,21 @@ pub fn open_cache_db() -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>, Er
let mut builder = rkv::Rkv::environment_builder();
builder
.set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable
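// I'm not sure why this is needed. Otherwise LMDB transactions (open readers) will keep piling up until it fails with
// LmdbError(ReadersFull).
// Hope it doesn't break integrity.
// NOTE(review): this second set_flags call may replace, rather than add to, the NO_SYNC | WRITE_MAP flags set above — confirm against the builder's documentation and merge into one call if so.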
.set_flags(rkv::EnvironmentFlags::NO_TLS)
.set_map_size(2 * 1024 * 1024 * 1024)
.set_max_dbs(100);
.set_max_dbs(100)
.set_max_readers(128);
rkv::Rkv::from_env(p, builder)
})
.expect("could not get/create db");
Ok(db_arc)
}
pub fn rga_preproc(
ai: AdaptInfo,
pub fn rga_preproc<'a>(
ai: AdaptInfo<'a>,
mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>,
) -> Result<(), Error> {
let adapters = adapter_matcher()?;
@ -79,6 +86,7 @@ pub fn rga_preproc(
let db = db_env
.open_single(db_name.as_str(), rkv::store::Options::create())
.map_err(|p| format_err!("could not open db store: {:?}", p))?;
let reader = db_env.read().expect("could not get reader");
let cached = db
.get(&reader, &cache_key)
@ -91,6 +99,7 @@ pub fn rga_preproc(
}
Some(_) => Err(format_err!("Integrity: value not blob")),
None => {
drop(reader);
let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?;
// start dupe
eprintln!("adapting...");