From e98c60001d2d3a40afaafb341edcf4fe96ae8997 Mon Sep 17 00:00:00 2001 From: phiresky Date: Wed, 5 Jun 2019 16:43:40 +0200 Subject: [PATCH] implement caching --- Cargo.lock | 155 ++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 5 ++ src/adapters.rs | 14 ++-- src/adapters/ffmpeg.rs | 2 +- src/adapters/pandoc.rs | 3 +- src/adapters/poppler.rs | 2 +- src/bin/rga-preproc.rs | 84 ++++++++++++++++++---- src/caching_writer.rs | 70 ++++++++++++++++++ src/lib.rs | 2 + 9 files changed, 315 insertions(+), 22 deletions(-) create mode 100644 src/caching_writer.rs diff --git a/Cargo.lock b/Cargo.lock index 0b07d68..189a1b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,6 +13,14 @@ name = "arrayref" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "arrayvec" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "autocfg" version = "0.1.4" @@ -38,10 +46,18 @@ name = "byteorder" version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "cachedir" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cc" version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "cfg-if" @@ -56,6 +72,42 @@ dependencies = [ "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-deque" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "either" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "failure" version = "0.1.5" @@ -90,6 +142,11 @@ name = "fuchsia-cprng" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "glob" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "idna" version = "0.1.5" @@ -149,6 +206,16 @@ name = "memchr" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "memoffset" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "nom" version = "2.2.1" @@ -162,6 +229,14 @@ dependencies = [ "autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "num_cpus" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ordered-float" version = "1.0.2" @@ -200,6 +275,11 @@ dependencies = [ "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "path-clean" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "percent-encoding" version = "1.0.1" @@ -331,6 +411,27 @@ dependencies = [ "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rayon" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rayon-core" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -368,9 +469,14 @@ dependencies = [ name = "rga" version = "0.1.0" dependencies = [ + "bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "path-clean 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "rkv 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", "tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "zstd 0.4.24+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -398,6 +504,11 @@ dependencies = [ "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "scopeguard" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "scopeguard" version = "1.0.0" @@ -542,21 +653,55 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "zstd" +version = "0.4.24+zstd.1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "zstd-safe 1.4.9+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "zstd-safe" +version = "1.4.9+zstd.1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", + "zstd-sys 1.4.10+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "zstd-sys" +version = "1.4.10+zstd.1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", +] + [metadata] "checksum aho-corasick 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e6f484ae0c99fec2e858eb6134949117399f222608d84cadb3f58c1f97c2364c" "checksum arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0d382e583f07208808f6b1249e60848879ba3543f57c32277bf52d69c2f0f0ee" +"checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" "checksum autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "0e49efa51329a5fd37e7c79db4621af617cd4e3e5bc224939808d076077077bf" "checksum bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "9f04a5e50dc80b3d5d35320889053637d15011aed5e66b66b37ae798c65da6f7" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb" +"checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8" "checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" +"checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" +"checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" +"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" "checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2" "checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1" "checksum fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "86d4de0081402f5e88cdac65c8dcdcc73118c1a7a465e2a05f0da05843a8ea33" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e" "checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" "checksum libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)" = "a844cabbd5a77e60403a58af576f0a1baa83c3dd2670be63e615bd24fc58b82d" @@ -565,12 +710,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum lock_api 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ed946d4529956a20f2d63ebe1b69996d5a2137c91913fe3ebbeff957f5bca7ff" "checksum matches 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" +"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" "checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff" "checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32" +"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" "checksum ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "18869315e81473c951eb56ad5558bbc56978562d3ecfb87abb7a1e944cea4518" "checksum ordermap 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a86ed3f5f244b372d6b1a00b72ef7f8876d0bc6a78a4c9985c53614041512063" "checksum parking_lot 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fa7767817701cce701d5585b9c4db3cdd02086398322c1d7e8bf5094a96a2ce7" "checksum parking_lot_core 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cb88cb1cb3790baa6776844f968fea3be44956cf184fa1be5a03341f5491278c" +"checksum path-clean 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd" "checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831" "checksum petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3659d1ee90221741f65dd128d9998311b0e40c5d3c23a62445938214abce4f" "checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" @@ -586,12 +735,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" "checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" "checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +"checksum rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "373814f27745b2686b350dd261bfd24576a6fb0e2c5919b3a2b6005f820b0473" +"checksum rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b055d1e92aba6877574d8fe604a63c8b5df60f60e5982bf7ccbb1338ea527356" "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" "checksum redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)" = "12229c14a0f65c4f1cb046a3b52047cdd9da1f4b30f8a39c5063c8bae515e252" "checksum regex 1.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "8f0a0bcab2fd7d1d7c54fa9eae6f43eddeb9ce2e7352f8518a814a4f65d60c58" "checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96" "checksum rkv 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)" = "2c1b8d667bf149bfac7c47bb728dfb7246f35fdf61c2f16f9f588194f087d23c" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" @@ -612,3 +764,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +"checksum zstd 0.4.24+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2c5a6414958b49ee80f2dd0042023ac8f37cfe1d31fbeec0b9749cf6f2c03683" +"checksum zstd-safe 1.4.9+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d98332212af687878b146a6549c188e9b72971972d23089c831472f938e6272" +"checksum zstd-sys 1.4.10+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "46f433134fbd0c37c9eb5929733df5f34bcdff464722eb93155fcee93eb57652" diff --git a/Cargo.toml b/Cargo.toml index b69392b..75ecbb4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,3 +19,8 @@ exclude = [ tree_magic = { package = "tree_magic_fork", version="0.2" } regex = "1.1.6" rkv = "0.9.5" +cachedir = "0.1.1" +path-clean = "0.1.0" +bincode = "1.1.4" +serde = "1.0.92" +zstd = "0.4.24" diff --git a/src/adapters.rs b/src/adapters.rs index bc90b3b..f4ef9bd 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -13,7 +13,7 @@ use std::io::Write; use std::rc::Rc; pub enum Matcher { - MimeType(Regex), // todo: generic pattern? + // MimeType(Regex), FileName(Regex), } @@ -27,7 +27,7 @@ pub struct FileMeta { // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // and since we probably only want to do matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed pub lossy_filename: String, - pub mimetype: String, + // pub mimetype: String, } pub trait GetMetadata { @@ -51,26 +51,26 @@ pub fn init_adapters() -> Result Option ]; let mut fname_regexes = vec![]; - let mut mime_regexes = vec![]; + //let mut mime_regexes = vec![]; for adapter in adapters.into_iter() { let metadata = adapter.metadata(); for matcher in &metadata.matchers { match matcher { - Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), + //Matcher::MimeType(re) => mime_regexes.push((re.clone(), adapter.clone())), Matcher::FileName(re) => fname_regexes.push((re.clone(), adapter.clone())), }; } } let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?; - let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; + //let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?; return Ok(move |meta: FileMeta| { // todo: handle multiple matches for m in fname_regex_set.matches(&meta.lossy_filename) { return Some(fname_regexes[m].1.clone()); } - for m in mime_regex_set.matches(&meta.mimetype) { + /*for m in mime_regex_set.matches(&meta.mimetype) { return Some(mime_regexes[m].1.clone()); - } + }*/ return None; }); } diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 2675a59..9dc985b 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -15,7 +15,7 @@ impl FFmpegAdapter { pub fn new() -> FFmpegAdapter { FFmpegAdapter { _metadata: AdapterMeta { - name: "FFmpeg".to_owned(), + name: "ffmpeg".to_owned(), version: 1, matchers: extensions.iter().map(|s| ExtensionMatcher(s)).collect(), }, diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs index 8064c6c..6a62c45 100644 --- a/src/adapters/pandoc.rs +++ b/src/adapters/pandoc.rs @@ -39,6 +39,7 @@ use std::process::Command; //"txt" -> Just "markdown" //"xhtml" -> Just "html" //"wiki" -> Just "mediawiki" + static extensions: &[&str] = &["epub", "odt", "docx", "pptx", "fb2", "icml", "rtf", "ipynb"]; pub struct PandocAdapter { @@ -67,7 +68,7 @@ impl SpawningFileAdapter for PandocAdapter { let mut cmd = Command::new("pandoc"); cmd // simpler markown (with more information loss but plainer text) - .arg("--to=markdown-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") + .arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") .arg("--wrap=none") .arg("--atx-headers") .arg("--") diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index 2a1e140..b7058f9 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -14,7 +14,7 @@ impl PopplerAdapter { pub fn new() -> PopplerAdapter { PopplerAdapter { _metadata: AdapterMeta { - name: "poppler pdftotext".to_owned(), + name: "poppler".to_owned(), version: 1, // todo: read from ffmpeg -demuxers? matchers: extensions.iter().map(|s| ExtensionMatcher(s)).collect(), diff --git a/src/bin/rga-preproc.rs b/src/bin/rga-preproc.rs index fc1eb56..a14009e 100644 --- a/src/bin/rga-preproc.rs +++ b/src/bin/rga-preproc.rs @@ -1,41 +1,101 @@ +use path_clean::PathClean; use rga::adapters::*; - +use rga::CachingWriter; +use serde::{Deserialize, Serialize}; use std::error::Error; use std::fmt; -use std::path::Path; +use std::io::Write; +use std::path::{Path, PathBuf}; use tree_magic; +const max_db_blob_len: usize = 2000000; + // lazy error fn lerr(inp: impl AsRef) -> Box { return inp.as_ref().into(); } +fn open_db() -> Result>, Box> { + let app_cache = cachedir::CacheDirConfig::new("rga").get_cache_dir()?; + + let db_arc = rkv::Manager::singleton() + .write() + .expect("could not write db manager") + .get_or_create(app_cache.as_path(), |p| { + let mut builder = rkv::Rkv::environment_builder(); + builder + .set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable + .set_map_size(2 * 1024 * 1024 * 1024) + .set_max_dbs(100); + rkv::Rkv::from_env(p, builder) + }) + .expect("could not get/create db"); + Ok(db_arc) +} + fn main() -> Result<(), Box> { + //db. let adapters = init_adapters()?; let filepath = std::env::args() .skip(1) .next() .ok_or(lerr("No filename specified"))?; - println!("fname: {}", filepath); - let path = Path::new(&filepath); + eprintln!("fname: {}", filepath); + let path = PathBuf::from(&filepath); + let serialized_path: Vec = + bincode::serialize(&path.clean()).expect("could not serialize path"); let filename = path.file_name().ok_or(lerr("Empty filename"))?; - let mimetype = tree_magic::from_filepath(path).ok_or(lerr(format!( + /*let mimetype = tree_magic::from_filepath(path).ok_or(lerr(format!( "File {} does not exist", filename.to_string_lossy() )))?; - println!("mimetype: {:?}", mimetype); + println!("mimetype: {:?}", mimetype);*/ let adapter = adapters(FileMeta { - mimetype, + // mimetype, lossy_filename: filename.to_string_lossy().to_string(), }); match adapter { Some(ad) => { - println!("adapter: {}", &ad.metadata().name); - let stdouti = std::io::stdout(); - let mut stdout = stdouti.lock(); - ad.adapt(&filepath, &mut stdout)?; - Ok(()) + let meta = ad.metadata(); + eprintln!("adapter: {}", &meta.name); + let db_name = format!("{}.v{}", meta.name, meta.version); + let db_arc = open_db()?; + let db_env = db_arc.read().unwrap(); + let db = db_env + .open_single(db_name.as_str(), rkv::store::Options::create()) + .map_err(|p| lerr(format!("could not open db store: {:?}", p)))?; + let reader = db_env.read().expect("could not get reader"); + match db + .get(&reader, &serialized_path) + .map_err(|p| lerr(format!("could not read from db: {:?}", p)))? + { + Some(rkv::Value::Blob(cached)) => { + let stdouti = std::io::stdout(); + zstd::stream::copy_decode(cached, stdouti.lock())?; + Ok(()) + } + Some(_) => Err(lerr("Integrity: value not blob")), + None => { + let stdouti = std::io::stdout(); + let mut compbuf = CachingWriter::new(stdouti.lock(), max_db_blob_len, 12)?; + ad.adapt(&filepath, &mut compbuf)?; + let compressed = compbuf.finish()?; + if let Some(cached) = compressed { + eprintln!("compressed len: {}", cached.len()); + + { + let mut writer = db_env.write().map_err(|p| { + lerr(format!("could not open write handle to cache: {:?}", p)) + })?; + db.put(&mut writer, &serialized_path, &rkv::Value::Blob(&cached)) + .map_err(|p| lerr(format!("could not write to cache: {:?}", p)))?; + writer.commit().unwrap(); + } + } + Ok(()) + } + } } None => { eprintln!("no adapter for that file, running cat!"); diff --git a/src/caching_writer.rs b/src/caching_writer.rs new file mode 100644 index 0000000..bb1b74c --- /dev/null +++ b/src/caching_writer.rs @@ -0,0 +1,70 @@ +use std::io::Write; + +enum Sta<'t> { + ToZstd(Vec, zstd::stream::write::Encoder<&'t mut Vec>), +} + +/** + * wrap a writer so that it is passthrough, + * but also the written data is compressed and written into a buffer, unless more than X bytes is written + */ +pub struct CachingWriter { + max_cache_size: usize, + zstd_writer: Option>>, + out: W, +} +impl CachingWriter { + pub fn new( + out: W, + max_cache_size: usize, + compression_level: i32, + ) -> std::io::Result> { + Ok(CachingWriter { + out, + max_cache_size, + zstd_writer: Some(zstd::stream::write::Encoder::new( + Vec::new(), + compression_level, + )?), + }) + } + pub fn finish(self) -> std::io::Result>> { + if let Some(writer) = self.zstd_writer { + let res = writer.finish()?; + if res.len() <= self.max_cache_size { + Ok(Some(res)) + } else { + // drop cache + Ok(None) + } + } else { + Ok(None) + } + } +} +impl Write for CachingWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + match self.zstd_writer.as_mut() { + Some(writer) => { + let wrote = writer.write(buf)?; + let compressed_len = writer.get_ref().len(); + eprintln!("wrote {} to zstd, len now {}", wrote, compressed_len); + if compressed_len > self.max_cache_size { + eprintln!("cache longer than max, dropping"); + //writer.finish(); + self.zstd_writer.take().unwrap().finish()?; + } + self.out.write_all(&buf[0..wrote])?; + return Ok(wrote); + } + None => self.out.write(buf), + } + } + fn flush(&mut self) -> std::io::Result<()> { + eprintln!("flushing"); + if let Some(writer) = self.zstd_writer.as_mut() { + writer.flush()?; + } + self.out.flush() + } +} diff --git a/src/lib.rs b/src/lib.rs index 4e9af61..b489eb7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1 +1,3 @@ pub mod adapters; +mod caching_writer; +pub use caching_writer::CachingWriter;