mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-24 04:14:57 +00:00
finally fix tar
This commit is contained in:
parent
83114f66bf
commit
83b804bef2
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -131,6 +131,16 @@ name = "cfg-if"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.0"
|
||||
@ -483,6 +493,15 @@ name = "nom"
|
||||
version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.8"
|
||||
@ -752,11 +771,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rga"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
dependencies = [
|
||||
"bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1127,6 +1147,7 @@ dependencies = [
|
||||
"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8"
|
||||
"checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d"
|
||||
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
|
||||
"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878"
|
||||
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
||||
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
||||
@ -1169,6 +1190,7 @@ dependencies = [
|
||||
"checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab"
|
||||
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
|
||||
"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff"
|
||||
"checksum num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)" = "b85e541ef8255f6cf42bbfe4ef361305c6c135d10919ecc26126c4e5ae94bc09"
|
||||
"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32"
|
||||
"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba"
|
||||
"checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"
|
||||
|
@ -5,7 +5,7 @@ cargo-features = ["default-run"]
|
||||
name = "rga"
|
||||
description = "ripgrep, except for pdf, ebooks, Office documents, etc"
|
||||
license = "AGPL-3.0-or-later"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
repository = "https://github.com/phiresky/rga"
|
||||
authors = ["phiresky <phireskyde+git@gmail.com>"]
|
||||
edition = "2018"
|
||||
@ -37,3 +37,4 @@ xz2 = "0.1.6"
|
||||
flate2 = "1.0.7"
|
||||
bzip2 = "0.3.3"
|
||||
tar = "0.4.26"
|
||||
chrono = "0.4.6"
|
||||
|
@ -3,6 +3,10 @@ similar:
|
||||
- pdfgrep
|
||||
- https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa
|
||||
|
||||
# todo
|
||||
|
||||
- jpg adapter (based on object classification / detection (yolo?)) for fun
|
||||
|
||||
# considerations
|
||||
|
||||
- matching on mime (magic bytes) instead of filename
|
||||
|
BIN
exampledir/test.tar
Normal file
BIN
exampledir/test.tar
Normal file
Binary file not shown.
Binary file not shown.
@ -4,6 +4,7 @@ use ::tar::EntryType::Regular;
|
||||
use failure::*;
|
||||
use lazy_static::lazy_static;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::path::PathBuf;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"];
|
||||
@ -31,30 +32,59 @@ impl GetMetadata for TarAdapter {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
/*struct WrapRead<'a> {
|
||||
inner: &mut 'a Read;
|
||||
}
|
||||
impl Read for WrapRead {
|
||||
r
|
||||
}*/
|
||||
|
||||
/*fn decompress_any(filename: &Path, inp: &mut Read) -> Fallible<Box<Read>> {
|
||||
// make a &mut Read into a owned Read because the streaming decompressors want to take ownership of their base Reads
|
||||
struct WrapRead<'a> {
|
||||
inner: &'a mut dyn Read,
|
||||
}
|
||||
impl<'a> Read for WrapRead<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
self.inner.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
// feeling a little stupid here. why is this needed at all
|
||||
enum SpecRead<R: Read> {
|
||||
Gz(flate2::read::MultiGzDecoder<R>),
|
||||
Bz2(bzip2::read::BzDecoder<R>),
|
||||
Xz(xz2::read::XzDecoder<R>),
|
||||
Zst(zstd::stream::read::Decoder<BufReader<R>>),
|
||||
Passthrough(R),
|
||||
}
|
||||
impl<R: Read> Read for SpecRead<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
use SpecRead::*;
|
||||
match self {
|
||||
Gz(z) => z.read(buf),
|
||||
Bz2(z) => z.read(buf),
|
||||
Xz(z) => z.read(buf),
|
||||
Zst(z) => z.read(buf),
|
||||
Passthrough(z) => z.read(buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
// why do I need to wrap the output here in a specific type? is it possible with just a Box<Read> for every type?
|
||||
fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible<SpecRead<WrapRead<'a>>>
|
||||
where
|
||||
R: Read,
|
||||
{
|
||||
let inp = WrapRead { inner: inp };
|
||||
let extension = filename.extension().map(|e| e.to_string_lossy().to_owned());
|
||||
match extension {
|
||||
Some(e) => Ok(match e.to_owned().as_ref() {
|
||||
"gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)),
|
||||
"bz2" => Box::new(bzip2::read::BzDecoder::new(inp)),
|
||||
"xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)),
|
||||
"zst" => Box::new(zstd::stream::read::Decoder::new(inp)?),
|
||||
e => Err(format_err!("don't know how to decompress {}", e))?,
|
||||
"gz" => SpecRead::Gz(flate2::read::MultiGzDecoder::new(inp)),
|
||||
"bz2" => SpecRead::Bz2(bzip2::read::BzDecoder::new(inp)),
|
||||
"xz" => SpecRead::Xz(xz2::read::XzDecoder::new_multi_decoder(inp)),
|
||||
"zst" => SpecRead::Zst(zstd::stream::read::Decoder::new(inp)?),
|
||||
"tar" => SpecRead::Passthrough(inp),
|
||||
ext => Err(format_err!("don't know how to decompress {}", ext))?,
|
||||
}),
|
||||
None => Err(format_err!("no extension")),
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
||||
impl FileAdapter for TarAdapter {
|
||||
fn adapt<'a>(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||
use std::io::prelude::*;
|
||||
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||
let AdaptInfo {
|
||||
filepath_hint,
|
||||
mut inp,
|
||||
@ -62,7 +92,8 @@ impl FileAdapter for TarAdapter {
|
||||
line_prefix,
|
||||
..
|
||||
} = ai;
|
||||
let decompress = inp; //decompress_any(filepath_hint, &inp)?;
|
||||
|
||||
let decompress = decompress_any(filepath_hint, &mut inp)?;
|
||||
let mut archive = ::tar::Archive::new(decompress);
|
||||
for entry in archive.entries()? {
|
||||
let mut file = entry.unwrap();
|
||||
@ -75,15 +106,13 @@ impl FileAdapter for TarAdapter {
|
||||
);
|
||||
if Regular == file.header().entry_type() {
|
||||
let line_prefix = &format!("{}{}: ", line_prefix, path.display());
|
||||
rga_preproc(
|
||||
AdaptInfo {
|
||||
let ai2: AdaptInfo = AdaptInfo {
|
||||
filepath_hint: &path,
|
||||
inp: &mut file,
|
||||
oup: oup,
|
||||
line_prefix,
|
||||
},
|
||||
None,
|
||||
)?;
|
||||
};
|
||||
rga_preproc(ai2, None)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
@ -24,9 +24,8 @@ fn main() -> Result<(), Error> {
|
||||
};
|
||||
|
||||
let cache_db = match env::var("RGA_NO_CACHE") {
|
||||
Ok(ref s) if s.len() > 0 => Some(open_cache_db()?),
|
||||
Ok(_) => None,
|
||||
Err(_) => None,
|
||||
Ok(ref s) if s.len() > 0 => None,
|
||||
Ok(_) | Err(_) => Some(open_cache_db()?),
|
||||
};
|
||||
|
||||
rga_preproc(ai, cache_db)
|
||||
|
@ -2,7 +2,9 @@ use crate::adapters::*;
|
||||
use crate::CachingWriter;
|
||||
use failure::{format_err, Error};
|
||||
use path_clean::PathClean;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::rc::Rc;
|
||||
@ -21,16 +23,21 @@ pub fn open_cache_db() -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>, Er
|
||||
let mut builder = rkv::Rkv::environment_builder();
|
||||
builder
|
||||
.set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable
|
||||
// i'm not sure why this is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
|
||||
// LmdbError(ReadersFull)
|
||||
// hope it doesn't break integrity
|
||||
.set_flags(rkv::EnvironmentFlags::NO_TLS)
|
||||
.set_map_size(2 * 1024 * 1024 * 1024)
|
||||
.set_max_dbs(100);
|
||||
.set_max_dbs(100)
|
||||
.set_max_readers(128);
|
||||
rkv::Rkv::from_env(p, builder)
|
||||
})
|
||||
.expect("could not get/create db");
|
||||
Ok(db_arc)
|
||||
}
|
||||
|
||||
pub fn rga_preproc(
|
||||
ai: AdaptInfo,
|
||||
pub fn rga_preproc<'a>(
|
||||
ai: AdaptInfo<'a>,
|
||||
mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>,
|
||||
) -> Result<(), Error> {
|
||||
let adapters = adapter_matcher()?;
|
||||
@ -79,6 +86,7 @@ pub fn rga_preproc(
|
||||
let db = db_env
|
||||
.open_single(db_name.as_str(), rkv::store::Options::create())
|
||||
.map_err(|p| format_err!("could not open db store: {:?}", p))?;
|
||||
|
||||
let reader = db_env.read().expect("could not get reader");
|
||||
let cached = db
|
||||
.get(&reader, &cache_key)
|
||||
@ -91,6 +99,7 @@ pub fn rga_preproc(
|
||||
}
|
||||
Some(_) => Err(format_err!("Integrity: value not blob")),
|
||||
None => {
|
||||
drop(reader);
|
||||
let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?;
|
||||
// start dupe
|
||||
eprintln!("adapting...");
|
||||
|
Loading…
Reference in New Issue
Block a user