fix cache key

This commit is contained in:
phiresky 2019-06-16 12:19:01 +02:00
parent 5447c4ac3a
commit 5901cdcb5d
13 changed files with 38 additions and 15 deletions

View File

@ -1,6 +1,7 @@
# 0.9.1 (2019-06-16)
- Add the list of enabled adapters to the cache key when caching the output of an adapter that recurses (e.g. archives)
- Prevent empty trailing page output in pdf reader
# 0.9.0 (2019-06-16)

View File

@ -26,7 +26,9 @@ pub struct AdapterMeta {
/// version identifier. used to key cache entries, change if your output format changes
pub version: i32,
pub description: String,
/// list of matchers (interpreted as ORed)
/// Indicates whether this adapter can descend (i.e. call rga_preproc again). If true, the cache key needs to include the list of active adapters.
pub recurses: bool,
/// list of matchers (interpreted as a OR b OR ...)
pub fast_matchers: Vec<FastMatcher>,
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
@ -71,7 +73,6 @@ pub struct AdaptInfo<'a> {
pub oup: &'a mut (dyn Write + Send),
/// prefix every output line with this string to better indicate the file's location if it is in some archive
pub line_prefix: &'a str,
// pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>,
pub config: PreprocConfig<'a>,
}

View File

@ -19,6 +19,7 @@ lazy_static! {
description:
"Reads compressed file as a stream and runs a different extractor on the contents."
.to_owned(),
recurses: true,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))

View File

@ -16,6 +16,7 @@ lazy_static! {
name: "ffmpeg".to_owned(),
version: 1,
description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -47,7 +48,7 @@ struct FFprobeStream {
codec_type: String, // video,audio,subtitle
}
impl FileAdapter for FFmpegAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
is_real_file,
filepath_hint,

View File

@ -48,6 +48,7 @@ lazy_static! {
description:
"Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
.to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))

View File

@ -17,6 +17,7 @@ lazy_static! {
name: "pdfpages".to_owned(),
version: 1,
description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(),
recurses: true,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -42,7 +43,7 @@ impl GetMetadata for PdfPagesAdapter {
/// A pdf is basically converted to a zip that has Page X.png files.
/// This way, something like tesseract can process the pages individually
impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
is_real_file,

View File

@ -12,6 +12,7 @@ lazy_static! {
version: 1,
description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files"
.to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -50,6 +51,9 @@ impl SpawningFileAdapter for PopplerAdapter {
// page break
line = line.replace('\x0c', "");
page += 1;
if line.is_empty() {
continue;
}
}
oup.write_all(format!("{}Page {}: {}\n", line_prefix, page, line).as_bytes())?;
}

View File

@ -93,7 +93,7 @@ impl<T> FileAdapter for T
where
T: SpawningFileAdapter,
{
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
mut inp,

View File

@ -14,6 +14,7 @@ lazy_static! {
description:
"Uses sqlite bindings to convert sqlite databases into a simple plain text format"
.to_owned(),
recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think)
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -56,7 +57,7 @@ fn format_blob(b: ValueRef) -> String {
}
impl FileAdapter for SqliteAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
is_real_file,
filepath_hint,

View File

@ -13,6 +13,7 @@ lazy_static! {
name: "tar".to_owned(),
version: 1,
description: "Reads a tar file as a stream and recurses down into its contents".to_owned(),
recurses: true,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -35,7 +36,7 @@ impl GetMetadata for TarAdapter {
}
impl FileAdapter for TarAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
mut inp,

View File

@ -10,6 +10,7 @@ lazy_static! {
name: "tesseract".to_owned(),
version: 1,
description: "Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed.".to_owned(),
recurses: false,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))

View File

@ -14,6 +14,7 @@ lazy_static! {
name: "zip".to_owned(),
version: 1,
description: "Reads a zip file as a stream and recurses down into its contents".to_owned(),
recurses: true,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -45,7 +46,7 @@ fn is_dir(f: &ZipFile) -> bool {
}
impl FileAdapter for ZipAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
mut inp,

View File

@ -72,13 +72,22 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> {
let clean_path = filepath_hint.to_owned().clean();
let meta = std::fs::metadata(&filepath_hint)?;
let key = (
clean_path,
meta.modified().expect("weird OS that can't into mtime"),
);
eprintln!("cache key: {:?}", key);
bincode::serialize(&key).expect("could not serialize path") // key in the cache database
if adapter.metadata().recurses {
let key = (
clean_path,
meta.modified().expect("weird OS that can't into mtime"),
&args.adapters[..],
);
eprintln!("cache key: {:?}", key);
bincode::serialize(&key).expect("could not serialize path") // key in the cache database
} else {
let key = (
clean_path,
meta.modified().expect("weird OS that can't into mtime"),
);
eprintln!("cache key: {:?}", key);
bincode::serialize(&key).expect("could not serialize path") // key in the cache database
}
};
cache.write().unwrap().get_or_run(
&db_name,