finally fix tar

This commit is contained in:
phiresky 2019-06-06 23:19:59 +02:00
parent 83114f66bf
commit 83b804bef2
8 changed files with 97 additions and 33 deletions

24
Cargo.lock generated
View File

@ -131,6 +131,16 @@ name = "cfg-if"
version = "0.1.9" version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chrono"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "2.33.0" version = "2.33.0"
@ -483,6 +493,15 @@ name = "nom"
version = "2.2.1" version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num-integer"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.8" version = "0.2.8"
@ -752,11 +771,12 @@ dependencies = [
[[package]] [[package]]
name = "rga" name = "rga"
version = "0.1.0" version = "0.2.0"
dependencies = [ dependencies = [
"bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1127,6 +1147,7 @@ dependencies = [
"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8" "checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8"
"checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d" "checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d"
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" "checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
@ -1169,6 +1190,7 @@ dependencies = [
"checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab" "checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab"
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff" "checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff"
"checksum num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)" = "b85e541ef8255f6cf42bbfe4ef361305c6c135d10919ecc26126c4e5ae94bc09"
"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32" "checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32"
"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" "checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba"
"checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" "checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"

View File

@ -5,7 +5,7 @@ cargo-features = ["default-run"]
name = "rga" name = "rga"
description = "ripgrep, except for pdf, ebooks, Office documents, etc" description = "ripgrep, except for pdf, ebooks, Office documents, etc"
license = "AGPL-3.0-or-later" license = "AGPL-3.0-or-later"
version = "0.1.0" version = "0.2.0"
repository = "https://github.com/phiresky/rga" repository = "https://github.com/phiresky/rga"
authors = ["phiresky <phireskyde+git@gmail.com>"] authors = ["phiresky <phireskyde+git@gmail.com>"]
edition = "2018" edition = "2018"
@ -37,3 +37,4 @@ xz2 = "0.1.6"
flate2 = "1.0.7" flate2 = "1.0.7"
bzip2 = "0.3.3" bzip2 = "0.3.3"
tar = "0.4.26" tar = "0.4.26"
chrono = "0.4.6"

View File

@ -3,6 +3,10 @@ similar:
- pdfgrep - pdfgrep
- https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa - https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa
# todo
- jpg adapter (based on object classification / detection (yolo?)) for fun
# considerations # considerations
- matching on mime (magic bytes) instead of filename - matching on mime (magic bytes) instead of filename

BIN
exampledir/test.tar Normal file

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,7 @@ use ::tar::EntryType::Regular;
use failure::*; use failure::*;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use std::fs::File; use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf; use std::path::PathBuf;
static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"]; static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"];
@ -31,30 +32,59 @@ impl GetMetadata for TarAdapter {
&METADATA &METADATA
} }
} }
/*struct WrapRead<'a> {
inner: &mut 'a Read;
}
impl Read for WrapRead {
r
}*/
/*fn decompress_any(filename: &Path, inp: &mut Read) -> Fallible<Box<Read>> { // make a &mut Read into an owned Read because the streaming decompressors want to take ownership of their base Reads
/// Owned wrapper around a borrowed reader.
///
/// Holds a `&mut dyn Read` and implements [`Read`] itself, so the borrow can be
/// handed by value to consumers that insist on owning their reader.
struct WrapRead<'a> {
    /// The borrowed reader every `read` call is forwarded to.
    inner: &'a mut dyn Read,
}

impl Read for WrapRead<'_> {
    /// Forwards the request to the wrapped reader and returns its result unchanged.
    fn read(&mut self, dest: &mut [u8]) -> std::io::Result<usize> {
        Read::read(&mut *self.inner, dest)
    }
}
// feeling a little stupid here. why is this needed at all
/// A reader that is either one of the supported decompressing readers or the
/// raw input itself, chosen at runtime.
///
/// Every variant owns its underlying reader `R`.
enum SpecRead<R>
where
    R: Read,
{
    /// gzip input, decoded through `flate2`'s `MultiGzDecoder`.
    Gz(flate2::read::MultiGzDecoder<R>),
    /// bzip2 input.
    Bz2(bzip2::read::BzDecoder<R>),
    /// xz input.
    Xz(xz2::read::XzDecoder<R>),
    /// zstd input; the decoder type wraps the reader in a `BufReader`.
    Zst(zstd::stream::read::Decoder<BufReader<R>>),
    /// No decompression: reads go straight to `R`.
    Passthrough(R),
}
impl<R: Read> Read for SpecRead<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
use SpecRead::*;
match self {
Gz(z) => z.read(buf),
Bz2(z) => z.read(buf),
Xz(z) => z.read(buf),
Zst(z) => z.read(buf),
Passthrough(z) => z.read(buf),
}
}
}
// why do I need to wrap the output here in a specific type? is it possible with just a Box<Read> for every type?
fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible<SpecRead<WrapRead<'a>>>
where
R: Read,
{
let inp = WrapRead { inner: inp };
let extension = filename.extension().map(|e| e.to_string_lossy().to_owned()); let extension = filename.extension().map(|e| e.to_string_lossy().to_owned());
match extension { match extension {
Some(e) => Ok(match e.to_owned().as_ref() { Some(e) => Ok(match e.to_owned().as_ref() {
"gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), "gz" => SpecRead::Gz(flate2::read::MultiGzDecoder::new(inp)),
"bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), "bz2" => SpecRead::Bz2(bzip2::read::BzDecoder::new(inp)),
"xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)), "xz" => SpecRead::Xz(xz2::read::XzDecoder::new_multi_decoder(inp)),
"zst" => Box::new(zstd::stream::read::Decoder::new(inp)?), "zst" => SpecRead::Zst(zstd::stream::read::Decoder::new(inp)?),
e => Err(format_err!("don't know how to decompress {}", e))?, "tar" => SpecRead::Passthrough(inp),
ext => Err(format_err!("don't know how to decompress {}", ext))?,
}), }),
None => Err(format_err!("no extension")), None => Err(format_err!("no extension")),
} }
}*/ }
impl FileAdapter for TarAdapter { impl FileAdapter for TarAdapter {
fn adapt<'a>(&self, ai: AdaptInfo) -> Fallible<()> { fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
use std::io::prelude::*;
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
mut inp, mut inp,
@ -62,7 +92,8 @@ impl FileAdapter for TarAdapter {
line_prefix, line_prefix,
.. ..
} = ai; } = ai;
let decompress = inp; //decompress_any(filepath_hint, &inp)?;
let decompress = decompress_any(filepath_hint, &mut inp)?;
let mut archive = ::tar::Archive::new(decompress); let mut archive = ::tar::Archive::new(decompress);
for entry in archive.entries()? { for entry in archive.entries()? {
let mut file = entry.unwrap(); let mut file = entry.unwrap();
@ -75,15 +106,13 @@ impl FileAdapter for TarAdapter {
); );
if Regular == file.header().entry_type() { if Regular == file.header().entry_type() {
let line_prefix = &format!("{}{}: ", line_prefix, path.display()); let line_prefix = &format!("{}{}: ", line_prefix, path.display());
rga_preproc( let ai2: AdaptInfo = AdaptInfo {
AdaptInfo {
filepath_hint: &path, filepath_hint: &path,
inp: &mut file, inp: &mut file,
oup: oup, oup: oup,
line_prefix, line_prefix,
}, };
None, rga_preproc(ai2, None)?;
)?;
} }
} }
Ok(()) Ok(())

View File

@ -24,9 +24,8 @@ fn main() -> Result<(), Error> {
}; };
let cache_db = match env::var("RGA_NO_CACHE") { let cache_db = match env::var("RGA_NO_CACHE") {
Ok(ref s) if s.len() > 0 => Some(open_cache_db()?), Ok(ref s) if s.len() > 0 => None,
Ok(_) => None, Ok(_) | Err(_) => Some(open_cache_db()?),
Err(_) => None,
}; };
rga_preproc(ai, cache_db) rga_preproc(ai, cache_db)

View File

@ -2,7 +2,9 @@ use crate::adapters::*;
use crate::CachingWriter; use crate::CachingWriter;
use failure::{format_err, Error}; use failure::{format_err, Error};
use path_clean::PathClean; use path_clean::PathClean;
use std::fs::File;
use std::io::Read; use std::io::Read;
use std::io::Write;
use std::path::Path; use std::path::Path;
use std::path::PathBuf; use std::path::PathBuf;
use std::rc::Rc; use std::rc::Rc;
@ -21,16 +23,21 @@ pub fn open_cache_db() -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>, Er
let mut builder = rkv::Rkv::environment_builder(); let mut builder = rkv::Rkv::environment_builder();
builder builder
.set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable .set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable
// i'm not sure why this is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
// LmdbError(ReadersFull)
// hope it doesn't break integrity
.set_flags(rkv::EnvironmentFlags::NO_TLS)
.set_map_size(2 * 1024 * 1024 * 1024) .set_map_size(2 * 1024 * 1024 * 1024)
.set_max_dbs(100); .set_max_dbs(100)
.set_max_readers(128);
rkv::Rkv::from_env(p, builder) rkv::Rkv::from_env(p, builder)
}) })
.expect("could not get/create db"); .expect("could not get/create db");
Ok(db_arc) Ok(db_arc)
} }
pub fn rga_preproc( pub fn rga_preproc<'a>(
ai: AdaptInfo, ai: AdaptInfo<'a>,
mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>, mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>,
) -> Result<(), Error> { ) -> Result<(), Error> {
let adapters = adapter_matcher()?; let adapters = adapter_matcher()?;
@ -79,6 +86,7 @@ pub fn rga_preproc(
let db = db_env let db = db_env
.open_single(db_name.as_str(), rkv::store::Options::create()) .open_single(db_name.as_str(), rkv::store::Options::create())
.map_err(|p| format_err!("could not open db store: {:?}", p))?; .map_err(|p| format_err!("could not open db store: {:?}", p))?;
let reader = db_env.read().expect("could not get reader"); let reader = db_env.read().expect("could not get reader");
let cached = db let cached = db
.get(&reader, &cache_key) .get(&reader, &cache_key)
@ -91,6 +99,7 @@ pub fn rga_preproc(
} }
Some(_) => Err(format_err!("Integrity: value not blob")), Some(_) => Err(format_err!("Integrity: value not blob")),
None => { None => {
drop(reader);
let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?; let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?;
// start dupe // start dupe
eprintln!("adapting..."); eprintln!("adapting...");