mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-09 14:30:37 +00:00
finally fix tar
This commit is contained in:
parent
83114f66bf
commit
83b804bef2
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -131,6 +131,16 @@ name = "cfg-if"
|
|||||||
version = "0.1.9"
|
version = "0.1.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chrono"
|
||||||
|
version = "0.4.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "2.33.0"
|
version = "2.33.0"
|
||||||
@ -483,6 +493,15 @@ name = "nom"
|
|||||||
version = "2.2.1"
|
version = "2.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num-integer"
|
||||||
|
version = "0.1.41"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-traits"
|
name = "num-traits"
|
||||||
version = "0.2.8"
|
version = "0.2.8"
|
||||||
@ -752,11 +771,12 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rga"
|
name = "rga"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -1127,6 +1147,7 @@ dependencies = [
|
|||||||
"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8"
|
"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8"
|
||||||
"checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d"
|
"checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d"
|
||||||
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
|
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
|
||||||
|
"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878"
|
||||||
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
||||||
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||||
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
||||||
@ -1169,6 +1190,7 @@ dependencies = [
|
|||||||
"checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab"
|
"checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab"
|
||||||
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
|
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
|
||||||
"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff"
|
"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff"
|
||||||
|
"checksum num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)" = "b85e541ef8255f6cf42bbfe4ef361305c6c135d10919ecc26126c4e5ae94bc09"
|
||||||
"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32"
|
"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32"
|
||||||
"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba"
|
"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba"
|
||||||
"checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"
|
"checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"
|
||||||
|
@ -5,7 +5,7 @@ cargo-features = ["default-run"]
|
|||||||
name = "rga"
|
name = "rga"
|
||||||
description = "ripgrep, except for pdf, ebooks, Office documents, etc"
|
description = "ripgrep, except for pdf, ebooks, Office documents, etc"
|
||||||
license = "AGPL-3.0-or-later"
|
license = "AGPL-3.0-or-later"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
repository = "https://github.com/phiresky/rga"
|
repository = "https://github.com/phiresky/rga"
|
||||||
authors = ["phiresky <phireskyde+git@gmail.com>"]
|
authors = ["phiresky <phireskyde+git@gmail.com>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
@ -37,3 +37,4 @@ xz2 = "0.1.6"
|
|||||||
flate2 = "1.0.7"
|
flate2 = "1.0.7"
|
||||||
bzip2 = "0.3.3"
|
bzip2 = "0.3.3"
|
||||||
tar = "0.4.26"
|
tar = "0.4.26"
|
||||||
|
chrono = "0.4.6"
|
||||||
|
@ -3,6 +3,10 @@ similar:
|
|||||||
- pdfgrep
|
- pdfgrep
|
||||||
- https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa
|
- https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa
|
||||||
|
|
||||||
|
# todo
|
||||||
|
|
||||||
|
- jpg adapter (based on object classification / detection (yolo?)) for fun
|
||||||
|
|
||||||
# considerations
|
# considerations
|
||||||
|
|
||||||
- matching on mime (magic bytes) instead of filename
|
- matching on mime (magic bytes) instead of filename
|
||||||
|
BIN
exampledir/test.tar
Normal file
BIN
exampledir/test.tar
Normal file
Binary file not shown.
Binary file not shown.
@ -4,6 +4,7 @@ use ::tar::EntryType::Regular;
|
|||||||
use failure::*;
|
use failure::*;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"];
|
static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"];
|
||||||
@ -31,30 +32,59 @@ impl GetMetadata for TarAdapter {
|
|||||||
&METADATA
|
&METADATA
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*struct WrapRead<'a> {
|
|
||||||
inner: &mut 'a Read;
|
|
||||||
}
|
|
||||||
impl Read for WrapRead {
|
|
||||||
r
|
|
||||||
}*/
|
|
||||||
|
|
||||||
/*fn decompress_any(filename: &Path, inp: &mut Read) -> Fallible<Box<Read>> {
|
// make a &mut Read into a owned Read because the streaming decompressors want to take ownership of their base Reads
|
||||||
|
struct WrapRead<'a> {
|
||||||
|
inner: &'a mut dyn Read,
|
||||||
|
}
|
||||||
|
impl<'a> Read for WrapRead<'a> {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||||
|
self.inner.read(buf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// feeling a little stupid here. why is this needed at all
|
||||||
|
enum SpecRead<R: Read> {
|
||||||
|
Gz(flate2::read::MultiGzDecoder<R>),
|
||||||
|
Bz2(bzip2::read::BzDecoder<R>),
|
||||||
|
Xz(xz2::read::XzDecoder<R>),
|
||||||
|
Zst(zstd::stream::read::Decoder<BufReader<R>>),
|
||||||
|
Passthrough(R),
|
||||||
|
}
|
||||||
|
impl<R: Read> Read for SpecRead<R> {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||||
|
use SpecRead::*;
|
||||||
|
match self {
|
||||||
|
Gz(z) => z.read(buf),
|
||||||
|
Bz2(z) => z.read(buf),
|
||||||
|
Xz(z) => z.read(buf),
|
||||||
|
Zst(z) => z.read(buf),
|
||||||
|
Passthrough(z) => z.read(buf),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// why do I need to wrap the output here in a specific type? is it possible with just a Box<Read> for every type?
|
||||||
|
fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible<SpecRead<WrapRead<'a>>>
|
||||||
|
where
|
||||||
|
R: Read,
|
||||||
|
{
|
||||||
|
let inp = WrapRead { inner: inp };
|
||||||
let extension = filename.extension().map(|e| e.to_string_lossy().to_owned());
|
let extension = filename.extension().map(|e| e.to_string_lossy().to_owned());
|
||||||
match extension {
|
match extension {
|
||||||
Some(e) => Ok(match e.to_owned().as_ref() {
|
Some(e) => Ok(match e.to_owned().as_ref() {
|
||||||
"gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)),
|
"gz" => SpecRead::Gz(flate2::read::MultiGzDecoder::new(inp)),
|
||||||
"bz2" => Box::new(bzip2::read::BzDecoder::new(inp)),
|
"bz2" => SpecRead::Bz2(bzip2::read::BzDecoder::new(inp)),
|
||||||
"xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)),
|
"xz" => SpecRead::Xz(xz2::read::XzDecoder::new_multi_decoder(inp)),
|
||||||
"zst" => Box::new(zstd::stream::read::Decoder::new(inp)?),
|
"zst" => SpecRead::Zst(zstd::stream::read::Decoder::new(inp)?),
|
||||||
e => Err(format_err!("don't know how to decompress {}", e))?,
|
"tar" => SpecRead::Passthrough(inp),
|
||||||
|
ext => Err(format_err!("don't know how to decompress {}", ext))?,
|
||||||
}),
|
}),
|
||||||
None => Err(format_err!("no extension")),
|
None => Err(format_err!("no extension")),
|
||||||
}
|
}
|
||||||
}*/
|
}
|
||||||
|
|
||||||
impl FileAdapter for TarAdapter {
|
impl FileAdapter for TarAdapter {
|
||||||
fn adapt<'a>(&self, ai: AdaptInfo) -> Fallible<()> {
|
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||||
use std::io::prelude::*;
|
|
||||||
let AdaptInfo {
|
let AdaptInfo {
|
||||||
filepath_hint,
|
filepath_hint,
|
||||||
mut inp,
|
mut inp,
|
||||||
@ -62,7 +92,8 @@ impl FileAdapter for TarAdapter {
|
|||||||
line_prefix,
|
line_prefix,
|
||||||
..
|
..
|
||||||
} = ai;
|
} = ai;
|
||||||
let decompress = inp; //decompress_any(filepath_hint, &inp)?;
|
|
||||||
|
let decompress = decompress_any(filepath_hint, &mut inp)?;
|
||||||
let mut archive = ::tar::Archive::new(decompress);
|
let mut archive = ::tar::Archive::new(decompress);
|
||||||
for entry in archive.entries()? {
|
for entry in archive.entries()? {
|
||||||
let mut file = entry.unwrap();
|
let mut file = entry.unwrap();
|
||||||
@ -75,15 +106,13 @@ impl FileAdapter for TarAdapter {
|
|||||||
);
|
);
|
||||||
if Regular == file.header().entry_type() {
|
if Regular == file.header().entry_type() {
|
||||||
let line_prefix = &format!("{}{}: ", line_prefix, path.display());
|
let line_prefix = &format!("{}{}: ", line_prefix, path.display());
|
||||||
rga_preproc(
|
let ai2: AdaptInfo = AdaptInfo {
|
||||||
AdaptInfo {
|
|
||||||
filepath_hint: &path,
|
filepath_hint: &path,
|
||||||
inp: &mut file,
|
inp: &mut file,
|
||||||
oup: oup,
|
oup: oup,
|
||||||
line_prefix,
|
line_prefix,
|
||||||
},
|
};
|
||||||
None,
|
rga_preproc(ai2, None)?;
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -24,9 +24,8 @@ fn main() -> Result<(), Error> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let cache_db = match env::var("RGA_NO_CACHE") {
|
let cache_db = match env::var("RGA_NO_CACHE") {
|
||||||
Ok(ref s) if s.len() > 0 => Some(open_cache_db()?),
|
Ok(ref s) if s.len() > 0 => None,
|
||||||
Ok(_) => None,
|
Ok(_) | Err(_) => Some(open_cache_db()?),
|
||||||
Err(_) => None,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
rga_preproc(ai, cache_db)
|
rga_preproc(ai, cache_db)
|
||||||
|
@ -2,7 +2,9 @@ use crate::adapters::*;
|
|||||||
use crate::CachingWriter;
|
use crate::CachingWriter;
|
||||||
use failure::{format_err, Error};
|
use failure::{format_err, Error};
|
||||||
use path_clean::PathClean;
|
use path_clean::PathClean;
|
||||||
|
use std::fs::File;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
use std::io::Write;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
@ -21,16 +23,21 @@ pub fn open_cache_db() -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>, Er
|
|||||||
let mut builder = rkv::Rkv::environment_builder();
|
let mut builder = rkv::Rkv::environment_builder();
|
||||||
builder
|
builder
|
||||||
.set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable
|
.set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable
|
||||||
|
// i'm not sure why this is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
|
||||||
|
// LmdbError(ReadersFull)
|
||||||
|
// hope it doesn't break integrity
|
||||||
|
.set_flags(rkv::EnvironmentFlags::NO_TLS)
|
||||||
.set_map_size(2 * 1024 * 1024 * 1024)
|
.set_map_size(2 * 1024 * 1024 * 1024)
|
||||||
.set_max_dbs(100);
|
.set_max_dbs(100)
|
||||||
|
.set_max_readers(128);
|
||||||
rkv::Rkv::from_env(p, builder)
|
rkv::Rkv::from_env(p, builder)
|
||||||
})
|
})
|
||||||
.expect("could not get/create db");
|
.expect("could not get/create db");
|
||||||
Ok(db_arc)
|
Ok(db_arc)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn rga_preproc(
|
pub fn rga_preproc<'a>(
|
||||||
ai: AdaptInfo,
|
ai: AdaptInfo<'a>,
|
||||||
mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>,
|
mb_db_arc: Option<std::sync::Arc<std::sync::RwLock<rkv::Rkv>>>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let adapters = adapter_matcher()?;
|
let adapters = adapter_matcher()?;
|
||||||
@ -79,6 +86,7 @@ pub fn rga_preproc(
|
|||||||
let db = db_env
|
let db = db_env
|
||||||
.open_single(db_name.as_str(), rkv::store::Options::create())
|
.open_single(db_name.as_str(), rkv::store::Options::create())
|
||||||
.map_err(|p| format_err!("could not open db store: {:?}", p))?;
|
.map_err(|p| format_err!("could not open db store: {:?}", p))?;
|
||||||
|
|
||||||
let reader = db_env.read().expect("could not get reader");
|
let reader = db_env.read().expect("could not get reader");
|
||||||
let cached = db
|
let cached = db
|
||||||
.get(&reader, &cache_key)
|
.get(&reader, &cache_key)
|
||||||
@ -91,6 +99,7 @@ pub fn rga_preproc(
|
|||||||
}
|
}
|
||||||
Some(_) => Err(format_err!("Integrity: value not blob")),
|
Some(_) => Err(format_err!("Integrity: value not blob")),
|
||||||
None => {
|
None => {
|
||||||
|
drop(reader);
|
||||||
let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?;
|
let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?;
|
||||||
// start dupe
|
// start dupe
|
||||||
eprintln!("adapting...");
|
eprintln!("adapting...");
|
||||||
|
Loading…
Reference in New Issue
Block a user