partial zip adapter, cleanup

This commit is contained in:
phiresky 2022-12-26 21:51:22 +01:00
parent b29e6dec0e
commit f624a6a29a
5 changed files with 270 additions and 346 deletions

415
Cargo.lock generated
View File

@ -8,18 +8,6 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aes"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures",
"opaque-debug",
]
[[package]]
name = "ahash"
version = "0.7.6"
@ -55,6 +43,15 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
@ -119,6 +116,29 @@ dependencies = [
"syn",
]
[[package]]
name = "async_io_utilities"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b20cffc5590f4bf33f05f97a3ea587feba9c50d20325b401daa096b92ff7da0"
dependencies = [
"tokio 1.23.0",
]
[[package]]
name = "async_zip"
version = "0.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a36d43bdefc7215b2b3a97edd03b1553b7969ad76551025eedd3b913c645f6e"
dependencies = [
"async-compression",
"async_io_utilities",
"chrono",
"crc32fast",
"thiserror",
"tokio 1.23.0",
]
[[package]]
name = "atty"
version = "0.2.14"
@ -136,12 +156,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "base64ct"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b645a089122eccb6111b4f81cbc1a49f5900ac4666bb93ac027feaecf15607bf"
[[package]]
name = "bincode"
version = "1.3.3"
@ -157,15 +171,6 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "block-buffer"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e"
dependencies = [
"generic-array 0.14.6",
]
[[package]]
name = "brotli"
version = "3.3.4"
@ -187,6 +192,12 @@ dependencies = [
"alloc-stdlib",
]
[[package]]
name = "bumpalo"
version = "3.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
[[package]]
name = "bytecount"
version = "0.6.3"
@ -248,12 +259,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cipher"
version = "0.3.0"
name = "chrono"
version = "0.4.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7"
checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
dependencies = [
"generic-array 0.14.6",
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"time",
"wasm-bindgen",
"winapi",
]
[[package]]
@ -295,10 +312,14 @@ dependencies = [
]
[[package]]
name = "constant_time_eq"
version = "0.1.5"
name = "codespan-reporting"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
dependencies = [
"termcolor",
"unicode-width",
]
[[package]]
name = "convert_case"
@ -307,13 +328,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cpufeatures"
version = "0.2.5"
name = "core-foundation-sys"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320"
dependencies = [
"libc",
]
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]]
name = "crc32fast"
@ -391,16 +409,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array 0.14.6",
"typenum",
]
[[package]]
name = "ctor"
version = "0.1.26"
@ -411,6 +419,50 @@ dependencies = [
"syn",
]
[[package]]
name = "cxx"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd"
dependencies = [
"cc",
"cxxbridge-flags",
"cxxbridge-macro",
"link-cplusplus",
]
[[package]]
name = "cxx-build"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0"
dependencies = [
"cc",
"codespan-reporting",
"once_cell",
"proc-macro2",
"quote",
"scratch",
"syn",
]
[[package]]
name = "cxxbridge-flags"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59"
[[package]]
name = "cxxbridge-macro"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -430,17 +482,6 @@ version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
[[package]]
name = "digest"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
name = "directories-next"
version = "2.0.0"
@ -663,16 +704,6 @@ dependencies = [
"typenum",
]
[[package]]
name = "generic-array"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
@ -690,7 +721,7 @@ checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
dependencies = [
"cfg-if",
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
@ -744,21 +775,36 @@ dependencies = [
"libc",
]
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "iana-time-zone"
version = "0.1.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"winapi",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
dependencies = [
"cxx",
"cxx-build",
]
[[package]]
name = "id-arena"
version = "2.2.1"
@ -832,6 +878,15 @@ dependencies = [
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "json_comments"
version = "0.2.1"
@ -861,6 +916,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "link-cplusplus"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
dependencies = [
"cc",
]
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
@ -958,7 +1022,7 @@ checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de"
dependencies = [
"libc",
"log",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys",
]
@ -1052,12 +1116,6 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860"
[[package]]
name = "opaque-debug"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
[[package]]
name = "ordered-float"
version = "3.4.0"
@ -1105,17 +1163,6 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "password-hash"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
dependencies = [
"base64ct",
"rand_core",
"subtle",
]
[[package]]
name = "paste"
version = "1.0.11"
@ -1128,18 +1175,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd"
[[package]]
name = "pbkdf2"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
dependencies = [
"digest",
"hmac",
"password-hash",
"sha2",
]
[[package]]
name = "percent-encoding"
version = "2.2.0"
@ -1244,12 +1279,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "redox_syscall"
version = "0.2.16"
@ -1303,6 +1332,7 @@ dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async_zip",
"bincode",
"bytes 1.3.0",
"clap 4.0.32",
@ -1340,7 +1370,6 @@ dependencies = [
"tokio-test",
"tokio-util",
"tree_magic_mini",
"zip",
]
[[package]]
@ -1440,6 +1469,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "scratch"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
[[package]]
name = "semver"
version = "1.0.16"
@ -1448,18 +1483,18 @@ checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a"
[[package]]
name = "serde"
version = "1.0.151"
version = "1.0.152"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fed41fc1a24994d044e6db6935e69511a1153b52c15eb42493b26fa87feba0"
checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.151"
version = "1.0.152"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "255abe9a125a985c05190d687b320c12f9b1f0b99445e608c21ba0782c719ad8"
checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
dependencies = [
"proc-macro2",
"quote",
@ -1488,28 +1523,6 @@ dependencies = [
"serde",
]
[[package]]
name = "sha1"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sha2"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.0"
@ -1525,7 +1538,7 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed5f6ab2122c6dec69dca18c72fa4590a27e581ad20d44960fe74c032a0b23b"
dependencies = [
"generic-array 0.12.4",
"generic-array",
"num",
]
@ -1590,12 +1603,6 @@ dependencies = [
"syn",
]
[[package]]
name = "subtle"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "syn"
version = "1.0.107"
@ -1671,29 +1678,13 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.17"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376"
checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
dependencies = [
"itoa",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd"
[[package]]
name = "time-macros"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2"
dependencies = [
"time-core",
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"winapi",
]
[[package]]
@ -1927,12 +1918,72 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f"
[[package]]
name = "winapi"
version = "0.3.9"
@ -2045,26 +2096,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec"
[[package]]
name = "zip"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "537ce7411d25e54e8ae21a7ce0b15840e7bfcff15b51d697ec3266cc76bdf080"
dependencies = [
"aes",
"byteorder",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"flate2",
"hmac",
"pbkdf2",
"sha1",
"time",
"zstd",
]
[[package]]
name = "zstd"
version = "0.11.2+zstd.1.5.2"

View File

@ -19,6 +19,7 @@ version = "0.9.7-alpha.0"
anyhow = "1.0.32"
async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]}
async-stream = "0.3.3"
async_zip = "0.0.9"
bincode = "1.3.1"
bytes = "1.2.1"
clap = {version = "4.0.18", features = ["wrap_help"]}
@ -53,7 +54,6 @@ tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]}
tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" }
tokio-util = {version = "0.7.4", features = ["io", "full"]}
tree_magic = {package = "tree_magic_mini", version = "3.0.0"}
zip = "0.6.3"
[dev-dependencies]
ctor = "0.1.20"

View File

@ -2,11 +2,9 @@ pub mod custom;
pub mod decompress;
// pub mod ffmpeg;
pub mod postproc;
// pub mod pdfpages;
use std::sync::Arc;
// pub mod sqlite;
pub mod tar;
// pub mod tesseract;
// pub mod writing;
// pub mod zip;
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
@ -119,8 +117,6 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
Arc::new(decompress::DecompressAdapter::new()),
Arc::new(tar::TarAdapter::new()),
//Rc::new(sqlite::SqliteAdapter::new()),
// Rc::new(pdfpages::PdfPagesAdapter::new()),
// Rc::new(tesseract::TesseractAdapter::new()),
];
adapters.extend(
BUILTIN_SPAWNING_ADAPTERS

View File

@ -1,140 +0,0 @@
use super::*;
use crate::adapters::spawning::map_exe_error;
use crate::preproc::rga_preproc;
use lazy_static::lazy_static;
use log::*;
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;
use std::process::Command;
static EXTENSIONS: &[&str] = &["pdf"];
lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "pdfpages".to_owned(),
version: 1,
description: "Converts a pdf to its individual pages as png files. Only useful in combination with tesseract".to_owned(),
recurses: true,
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastFileMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: Some(vec![FileMatcher::MimeType(
"application/pdf".to_owned()
)]),
keep_fast_matchers_if_accurate: true,
disabled_by_default: true
};
}
#[derive(Default)]
pub struct PdfPagesAdapter {}
impl PdfPagesAdapter {
pub fn new() -> PdfPagesAdapter {
PdfPagesAdapter {}
}
}
impl GetMetadata for PdfPagesAdapter {
fn metadata(&self) -> &AdapterMeta {
&METADATA
}
}
/// A pdf is basically converted to a zip that has Page X.png files.
/// This way, something like tesseract can process the pages individually
impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<()> {
let AdaptInfo {
filepath_hint,
is_real_file,
oup,
line_prefix,
archive_recursion_depth,
config,
..
} = ai;
if !is_real_file {
// todo: read to memory and then use that blob if size < max
writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
return Ok(());
}
let inp_fname = filepath_hint;
let exe_name = "gm";
let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
let out_fname = out_dir.path().join("out%04d.png");
debug!("writing to temp dir: {}", out_fname.display());
let mut cmd = Command::new(exe_name);
cmd.arg("convert")
.arg("-density")
.arg("200")
.arg(inp_fname)
.arg("+adjoin")
.arg(out_fname);
let mut cmd = cmd
.spawn()
.map_err(|e| map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed."))?;
let args = config.args;
let status = cmd.wait()?;
if status.success() {
} else {
return Err(format_err!("subprocess failed: {:?}", status));
}
for (i, filename) in glob::glob(
out_dir
.path()
.join("out*.png")
.to_str()
.expect("temp path has invalid encoding"),
)?
.enumerate()
{
let mut ele = BufReader::new(File::open(filename?)?);
rga_preproc(AdaptInfo {
filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
is_real_file: false,
inp: &mut ele,
oup,
line_prefix: &format!("{}Page {}:", line_prefix, i + 1),
archive_recursion_depth: archive_recursion_depth + 1,
config: PreprocConfig { cache: None, args },
})?;
}
Ok(())
}
}
/*// todo: do this in an actually streaming fashion and less slow
// IEND chunk + PDF magic
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
let split_seq_inx = 8;
fn split_by_seq<'a>(
split_seq: &'a [u8],
split_inx: usize,
read: &mut Read,
) -> Result<impl IntoIterator<Item = impl Read> + 'a> {
let regex = split_seq
.iter()
.map(|c| format!("\\x{:0>2x}", c))
.collect::<Vec<_>>()
.join("");
let restr = format!("(?-u){}", regex);
eprintln!("re: {}", restr);
let re = regex::bytes::Regex::new(&restr)?;
let mut all = Vec::new();
read.read_to_end(&mut all)?;
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
let mut last = 0;
for (i, split) in re.find_iter(&all).enumerate() {
let pos = split.start() + split_inx;
out.push(Cursor::new(Vec::from(&all[last..pos])));
last = pos;
}
out.push(Cursor::new(Vec::from(&all[last..])));
Ok(out)
}*/

View File

@ -1,6 +1,8 @@
use super::*;
use crate::{adapted_iter::AdaptedFilesIter, print_bytes};
use anyhow::*;
use async_stream::stream;
use async_zip::read::stream::ZipFileReader;
use lazy_static::lazy_static;
use log::*;
@ -36,17 +38,52 @@ impl GetMetadata for ZipAdapter {
}
impl FileAdapter for ZipAdapter {
fn adapt<'a>(
&self,
inp: AdaptInfo<'a>,
_detection_reason: &FileMatcher,
) -> Result<Box<dyn AdaptedFilesIter + 'a>> {
Ok(Box::new(ZipAdaptIter { inp }))
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
let AdaptInfo {
inp,
filepath_hint,
archive_recursion_depth,
postprocess,
line_prefix,
config,
..
} = ai;
let mut zip = ZipFileReader::new(inp);
let s = stream! {
while !zip.finished() {
if let Some(mut reader) = zip.entry_reader().await? {
let file = reader.entry();
/* if file.is_dir() {
continue;
}*/
debug!(
"{}{}|{}: {} ({} packed)",
line_prefix,
filepath_hint.display(),
file.filename(),
print_bytes(file.uncompressed_size() as f64),
print_bytes(file.compressed_size() as f64)
);
let new_line_prefix = format!("{}{}: ", line_prefix, file.filename());
yield Ok(AdaptInfo {
filepath_hint: PathBuf::from(file.filename()),
is_real_file: false,
inp: Box::pin(reader),
line_prefix: new_line_prefix,
archive_recursion_depth: archive_recursion_depth + 1,
postprocess,
config: config.clone(),
});
}
}
};
Ok(Box::pin(s))
}
}
struct ZipAdaptIter<'a> {
inp: AdaptInfo<'a>,
/*struct ZipAdaptIter {
inp: AdaptInfo,
}
impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
fn next<'b>(&'b mut self) -> Option<AdaptInfo<'b>> {
@ -80,7 +117,7 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
})
})
}
}
}*/
#[cfg(test)]
mod test {