Merge remote-tracking branch 'upstream/master' into mbox-extractor

2024-11-08 14:00:37 +00:00 · 2023-07-31 14:34:18 +02:00 · 2023-07-31 14:34:18 +02:00 · 2259730c67
commit 2259730c67
parent 9642552fa3 16b2059851
26 changed files with 1358 additions and 1298 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,27 @@
 ---
 name: Bug report
 about: Create a report to help us improve
 title: ''
 labels: bug
 assignees: ''
 ---
 **Describe the bug**
 **To Reproduce**
 Attach example file:
 Run command:
 **Output**
 **Screenshots**
 If applicable, add screenshots to help explain your problem.
 **Operating System and Version**
 **Output of `rga --version`**
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,20 @@
 ---
 name: Feature request
 about: Suggest an idea for this project
 title: ''
 labels: ''
 assignees: ''
 ---
 **Is your feature request related to a problem? Please describe.**
 A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
 **Describe the solution you'd like**
 A clear and concise description of what you want to happen.
 **Describe alternatives you've considered**
 A clear and concise description of any alternative solutions or features you've considered.
 **Additional context**
 Add any other context or screenshots about the feature request here.
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -1,75 +1,25 @@
 # Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
 #
-# While our "example" application has the platform-specific code,
+# While our "example" application has platform-specific code,
-# for simplicity we are compiling and testing everything on the Ubuntu environment only.
+# for simplicity we are compiling and testing everything in a nix-on-Linux environment only.
 # For multi-OS testing see the `cross.yml` workflow.
 on: [push, pull_request]
 name: ci
 jobs:
-  check:
+  nix-flake-check:
-    name: Check
+    name: nix flake check
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
-      - name: Install stable toolchain
+      - name: Install nix
-        uses: actions-rs/toolchain@v1
+        uses: cachix/install-nix-action@v21
        with:
          profile: minimal
          toolchain: stable
          override: true
-      - name: Run cargo check
+      - name: Ensure the build succeeds
-        uses: actions-rs/cargo@v1
+        run: nix build
        with:
          command: check
-  test:
+      - name: Run `nix flake check` to run formatters, linters, and tests
-    name: Test Suite
+        run: nix flake check --print-build-logs
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
        uses: actions/checkout@v2
      - name: Install stable toolchain
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
      - name: Run cargo test
        uses: actions-rs/cargo@v1
        with:
          command: test
  lints:
    name: Lints
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
        uses: actions/checkout@v2
      - name: Install stable toolchain
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
          components: rustfmt, clippy
      - name: Run cargo fmt
        uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check
      - name: Run cargo clippy
        uses: actions-rs/cargo@v1
        with:
          command: clippy
          args: -- -D warnings
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -18,7 +18,7 @@ on:
    # branches:
    # - ag/release
    tags:
-    - 'v[0-9]+.[0-9]+.[0-9]+'
+      - "v[0-9]+.[0-9]+.[0-9]+*"
 jobs:
  create-release:
    name: create-release
@ -62,7 +62,7 @@ jobs:
  build-release:
    name: build-release
-    needs: ['create-release']
+    needs: ["create-release"]
    runs-on: ${{ matrix.os }}
    env:
      # For some builds, we use cross to test on 32-bit and big-endian
@ -79,11 +79,11 @@ jobs:
        build: [linux, linux-arm, macos, win-msvc]
        include:
          - build: linux
-          os: ubuntu-18.04
+            os: ubuntu-22.04
            rust: nightly
            target: x86_64-unknown-linux-musl
          - build: linux-arm
-          os: ubuntu-18.04
+            os: ubuntu-22.04
            rust: nightly
            target: arm-unknown-linux-gnueabihf
          - build: macos
@ -106,7 +106,7 @@ jobs:
          fetch-depth: 1
      - name: Install packages (Ubuntu)
-      if: matrix.os == 'ubuntu-18.04'
+        if: matrix.os == 'ubuntu-22.04'
        run: |
          ci/ubuntu-install-packages
@ -124,7 +124,7 @@ jobs:
          target: ${{ matrix.target }}
      - name: Use Cross
-      # if: matrix.os != 'windows-2019'
+        shell: bash
        run: |
          cargo install cross
          echo "CARGO=cross" >> $GITHUB_ENV
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -2,7 +2,7 @@
 [package]
 authors = ["phiresky <phireskyde+git@gmail.com>"]
 description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
-edition = "2018"
+edition = "2021"
 exclude = [
  "exampledir/*",
 ]
@ -11,56 +11,57 @@ license = "AGPL-3.0-or-later"
 name = "ripgrep_all"
 readme = "README.md"
 repository = "https://github.com/phiresky/ripgrep-all"
-version = "1.0.0-alpha.2"
+version = "1.0.0-alpha.5"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
-anyhow = "1.0.32"
+anyhow = {version = "1.0.71", features = ["backtrace"]}
-async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]}
+async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] }
-async-stream = "0.3.3"
+async-stream = "0.3.5"
-async-trait = "0.1.64"
+async-trait = "0.1.68"
-async_zip = "0.0.9"
+async_zip = {version = "0.0.12", features = ["full"]}
-bincode = "1.3.1"
+bincode = "1.3.3"
-bytes = "1.2.1"
+bytes = "1.4.0"
-clap = {version = "4.0.18", features = ["wrap_help"]}
+clap = {version = "4.3.0", features = ["wrap_help"]}
-crossbeam = "0.8.1"
+crossbeam = "0.8.2"
-crossbeam-channel = "0.5.1"
+crossbeam-channel = "0.5.8"
-derive_more = "0.99.9"
+derive_more = "0.99.17"
 directories-next = "2.0.0"
 dyn-clonable = "0.9.0"
-dyn-clone = "1.0.2"
+dyn-clone = "1.0.11"
-encoding_rs = "0.8.24"
+encoding_rs = "0.8.32"
 encoding_rs_io = "0.1.7"
-env_logger = "0.9.0"
+env_logger = "0.10.0"
-glob = "0.3.0"
+glob = "0.3.1"
 json_comments = "0.2.1"
 lazy_static = "1.4.0"
-log = "0.4.11"
+log = "0.4.17"
 mailbox = "0.2.0"
 mailparse = "0.14.0"
-memchr = "2.3.3"
+memchr = "2.5.0"
 mime2ext = "0.1.52"
-paste = "1.0.0"
+paste = "1.0.12"
-path-clean = "0.1.0"
+path-clean = "1.0.1"
 pretty-bytes = "0.2.2"
-regex = "1.3.9"
+regex = "1.8.2"
-rkv = "0.17"
+rusqlite = {version = "0.29.0", features = ["vtab", "bundled"]}
-rusqlite = {version = "0.28.0", features = ["vtab", "bundled"]}
+schemars = {version = "0.8.12", features = ["preserve_order"]}
-schemars = {version = "0.8.0-alpha-4", features = ["preserve_order"]}
+serde = {version = "1.0.163", features = ["derive"]}
-serde = {version = "1.0.115", features = ["derive"]}
+serde_json = "1.0.96"
 serde_json = "1.0.57"
 size_format = "1.0.2"
-structopt = "0.3.17"
+structopt = "0.3.26"
-tempfile = "3.1.0"
+tempfile = "3.5.0"
-tokio = {version = "1.21.2", features = ["full"]}
+tokio = {version = "1.28.1", features = ["full"]}
-tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]}
+tokio-rusqlite = "0.4.0"
 tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]}
 tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" }
-tokio-util = {version = "0.7.4", features = ["io", "full"]}
+tokio-util = {version = "0.7.8", features = ["io", "full"]}
-tree_magic = {package = "tree_magic_mini", version = "3.0.0"}
+tree_magic = {package = "tree_magic_mini", version = "3.0.3"}
 [dev-dependencies]
-async-recursion = "1.0.0"
+async-recursion = "1.0.4"
-ctor = "0.1.20"
+ctor = "0.2.0"
 pretty_assertions = "1.3.0"
 tempfile = "3.5.0"
 tokio-test = "0.4.2"
--- a/README.md
+++ b/README.md
@ -33,45 +33,7 @@ demo/
 ![rga-fzf](doc/rga-fzf.gif)
-You can use rga interactively via fzf. Add the following to your ~/.{bash,zsh}rc:
+See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.
 ```bash
 rga-fzf() {
 	RG_PREFIX="rga --files-with-matches"
 	local file
 	file="$(
 		FZF_DEFAULT_COMMAND="$RG_PREFIX '$1'" \
 			fzf --sort --preview="[[ ! -z {} ]] && rga --pretty --context 5 {q} {}" \
 				--phony -q "$1" \
 				--bind "change:reload:$RG_PREFIX {q}" \
 				--preview-window="70%:wrap"
 	)" &&
 	echo "opening $file" &&
 	xdg-open "$file"
 }
 ```
 And for your `~/.config/fish/config.fish`:
 ```
 function rga-fzf
    set RG_PREFIX 'rga --files-with-matches'
    if test (count $argv) -gt 1
        set RG_PREFIX "$RG_PREFIX $argv[1..-2]"
    end
    set -l file $file
    set file (
        FZF_DEFAULT_COMMAND="$RG_PREFIX '$argv[-1]'" \
        fzf --sort \
            --preview='test ! -z {} && \
                rga --pretty --context 5 {q} {}' \
            --phony -q "$argv[-1]" \
            --bind "change:reload:$RG_PREFIX {q}" \
            --preview-window='50%:wrap'
    ) && \
    echo "opening $file" && \
    open "$file"
 end
 ```
 ## INSTALLATION
@ -86,9 +48,11 @@ Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestr
 `pacman -S ripgrep-all`.
 #### Nix
 `nix-env -iA nixpkgs.ripgrep-all`
 #### Debian-based
 download the [rga binary][latestrelease] and get the dependencies like this:
 `apt install ripgrep pandoc poppler-utils ffmpeg`
@ -117,7 +81,7 @@ If you get an error like `VCRUNTIME140.DLL could not be found`, you need to inst
 To install the dependencies that are each not strictly necessary but very useful:
-`brew install pandoc poppler tesseract ffmpeg`
+`brew install pandoc poppler ffmpeg`
 ### Compile from source
@ -131,58 +95,58 @@ rga should compile with stable Rust (v1.36.0+, check with `rustc --version`). To
 ## Available Adapters
 rga works with _adapters_ that adapt various file formats. It comes with a few adapters integrated:
 ```
 rga --rga-list-adapters
 ```
 You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
 <!-- this part generated by update-readme.sh -->
 Adapters:
-   **ffmpeg**
+- **pandoc**
-    Uses ffmpeg to extract video metadata/chapters and subtitles.  
+  Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
-     Extensions: `.mkv`, `.mp4`, `.avi`
+  Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx  
-
+   Extensions: .epub, .odt, .docx, .fb2, .ipynb
 *   **pandoc**
    Uses pandoc to convert binary/unreadable text documents to plain markdown-like text.  
     Extensions: `.epub`, `.odt`, `.docx`, `.fb2`, `.ipynb`
 - **poppler**
-    Uses pdftotext (from poppler-utils) to extract plain text from PDF files.  
+  Uses pdftotext (from poppler-utils) to extract plain text from PDF files
-     Extensions: `.pdf`  
+  Runs: pdftotext - -  
-     Mime Types: `application/pdf`
+   Extensions: .pdf  
   Mime Types: application/pdf
 - **postprocpagebreaks**
  Adds the page number to each line for an input file that specifies page breaks as ascii page break character.
  Mainly to be used internally by the poppler adapter.  
   Extensions: .asciipagebreaks
 - **ffmpeg**
  Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata  
   Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm
 - **zip**
-    Reads a zip file as a stream and recurses down into its contents.  
+  Reads a zip file as a stream and recurses down into its contents  
-     Extensions: `.zip`  
+   Extensions: .zip, .jar  
-     Mime Types: `application/zip`
+   Mime Types: application/zip
 - **decompress**
  Reads compressed file as a stream and runs a different extractor on the contents.  
-     Extensions: `.tgz`, `.tbz`, `.tbz2`, `.gz`, `.bz2`, `.xz`, `.zst`  
+   Extensions: .tgz, .tbz, .tbz2, .gz, .bz2, .xz, .zst  
-     Mime Types: `application/gzip`, `application/x-bzip`, `application/x-xz`, `application/zstd`
+   Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd
 - **tar**
-    Reads a tar file as a stream and recurses down into its contents.  
+  Reads a tar file as a stream and recurses down into its contents  
-     Extensions: `.tar`
+   Extensions: .tar
-*   **sqlite**
+- **sqlite**
-    Uses sqlite bindings to convert sqlite databases into a simple plain text format.  
+  Uses sqlite bindings to convert sqlite databases into a simple plain text format  
-     Extensions: `.db`, `.db3`, `.sqlite`, `.sqlite3`  
+   Extensions: .db, .db3, .sqlite, .sqlite3  
-     Mime Types: `application/x-sqlite3`
+   Mime Types: application/x-sqlite3
-The following adapters are disabled by default, and can be enabled using `--rga-adapters=+pdfpages,tesseract`:
+The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':
 -   **pdfpages**
    Converts a pdf to its individual pages as png files. Only useful in combination with tesseract.  
     Extensions: `.pdf`  
     Mime Types: `application/pdf`
 -   **tesseract**
    Uses tesseract to run OCR on images to make them searchable.
    May need `-j1` to prevent overloading the system.
    Make sure you have tesseract installed.  
     Extensions: `.jpg`, `.png`
 ## USAGE:
@ -202,6 +166,17 @@ The following adapters are disabled by default, and can be enabled using `--rga-
 > Detection is only done on the first 8KiB of the file, since we can\'t
 > always seek on the input (in archives).
 **\--rga-no-cache**
 > Disable caching of results
 >
 > By default, rga caches the extracted text, if it is small enough, to a
 > database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
 > _\~/Library/Caches/ripgrep-all_ on macOS, or
 > C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
 > repeated searches on the same set of files will be much faster. If you
 > pass this flag, all caching will be disabled.
 **-h**, **\--help**
 > Prints help information
@ -210,15 +185,9 @@ The following adapters are disabled by default, and can be enabled using `--rga-
 > List all known adapters
-**\--rga-no-cache**
+**\--rga-print-config-schema**
-> Disable caching of results
+> Print the JSON Schema of the configuration file
 >
 > By default, rga caches the extracted text, if it is small enough, to a
 > database in \~/.cache/rga on Linux, _\~/Library/Caches/rga_ on macOS,
 > or C:\\Users\\username\\AppData\\Local\\rga on Windows. This way,
 > repeated searches on the same set of files will be much faster. If you
 > pass this flag, all caching will be disabled.
 **\--rg-help**
@ -242,25 +211,32 @@ The following adapters are disabled by default, and can be enabled using `--rga-
 > use all default adapters except for bar and baz. \"+bar,baz\" means
 > use all default adapters and also bar and baz.
-**\--rga-cache-compression-level=**\<cache-compression-level\>
+**\--rga-cache-compression-level=**\<compression-level\>
 > ZSTD compression level to apply to adapter outputs before storing in
 > cache db
 >
 > Ranges from 1 - 22 \[default: 12\]
-**\--rga-cache-max-blob-len=**\<cache-max-blob-len\>
+**\--rga-config-file=**\<config-file-path\>
 > Max compressed size to cache
 >
 > Longest byte length (after compression) to store in cache. Longer
 > adapter outputs will not be cached and recomputed every time. Allowed
 > suffixes: k M G \[default: 2000000\]
 **\--rga-max-archive-recursion=**\<max-archive-recursion\>
 > Maximum nestedness of archives to recurse into \[default: 4\]
 **\--rga-cache-max-blob-len=**\<max-blob-len\>
 > Max compressed size to cache
 >
 > Longest byte length (after compression) to store in cache. Longer
 > adapter outputs will not be cached and recomputed every time.
 >
 > Allowed suffixes on command line: k M G \[default: 2000000\]
 **\--rga-cache-path=**\<path\>
 > Path to store cache db \[default: /home/phire/.cache/ripgrep-all\]
 **-h** shows a concise overview, **\--help** shows more detail and
 advanced options.
@ -287,6 +263,7 @@ to debug the adapters.
 You can use the provided [`flake.nix`](./flake.nix) to setup all build- and
 run-time dependencies:
 1. Enable [Flakes](https://nixos.wiki/wiki/Flakes) in your Nix configuration.
 1. Add [`direnv`](https://direnv.net/) to your profile:
   `nix profile install nixpkgs#direnv`
--- a/doc/config.default.jsonc
+++ b/doc/config.default.jsonc
@ -7,7 +7,7 @@
  // https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc
  // The config options are the same as the command line options,
-  // but with --rga- prefix removed and - replaced with _.
+  // but with --rga- prefix removed and - and . replaced with _.
  // e.g. --rga-no-cache becomes `"no_cache": true`.
  // The only exception is the `custom_adapters` option, which can only be set in this file.
--- a/doc/update-readme.sh
+++ b/doc/update-readme.sh
@ -5,7 +5,7 @@ content=$(
 <!-- this part generated by update-readme.sh -->
 $(cargo run --bin rga -- --rga-list-adapters)
-$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --atx-headers | rg --multiline "## USAGE:(.|\n)*")
+$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
 <!-- end of part generated by update-readme.sh -->
 END
 )
--- a/exampledir/exif.png
+++ b/exampledir/exif.png
--- a/flake.lock
+++ b/flake.lock
@ -3,11 +3,11 @@
    "advisory-db": {
      "flake": false,
      "locked": {
-        "lastModified": 1670452192,
+        "lastModified": 1685821301,
-        "narHash": "sha256-f8NIFbqSbCzpssgDUK4srfgKaVaMhDScEptw4uuxGAc=",
+        "narHash": "sha256-4XRcnSboLJw1XKjDpg2jBU70jEw/8Bgx4nUmnq3kXbY=",
        "owner": "rustsec",
        "repo": "advisory-db",
-        "rev": "0a2faeb87195392b23333a8097309d29f2c5d31d",
+        "rev": "af3f3d503f82056785841bee49997bae65eba1c0",
        "type": "github"
      },
      "original": {
@ -26,11 +26,11 @@
        "rust-overlay": "rust-overlay"
      },
      "locked": {
-        "lastModified": 1670546681,
+        "lastModified": 1684981077,
-        "narHash": "sha256-S33bhME0zPHPEZyZPCsrdQL/4WW/A020PwN+a3z7Q+I=",
+        "narHash": "sha256-68X9cFm0RTZm8u0rXPbeBzOVUH5OoUGAfeHHVoxGd9o=",
        "owner": "ipetkov",
        "repo": "crane",
-        "rev": "63f80ee278897e72a1468090278716b5befa5128",
+        "rev": "35110cccf28823320f4fd697fcafcb5038683982",
        "type": "github"
      },
      "original": {
@ -42,11 +42,11 @@
    "flake-compat": {
      "flake": false,
      "locked": {
-        "lastModified": 1668681692,
+        "lastModified": 1673956053,
-        "narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
+        "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
        "owner": "edolstra",
        "repo": "flake-compat",
-        "rev": "009399224d5e398d03b22badca40a37ac85412a1",
+        "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
        "type": "github"
      },
      "original": {
@ -58,11 +58,11 @@
    "flake-compat_2": {
      "flake": false,
      "locked": {
-        "lastModified": 1668681692,
+        "lastModified": 1673956053,
-        "narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
+        "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
        "owner": "edolstra",
        "repo": "flake-compat",
-        "rev": "009399224d5e398d03b22badca40a37ac85412a1",
+        "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
        "type": "github"
      },
      "original": {
@ -72,12 +72,15 @@
      }
    },
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1667395993,
+        "lastModified": 1681202837,
-        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
+        "narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
+        "rev": "cfacdce06f30d2b68473a46042957675eebb3401",
        "type": "github"
      },
      "original": {
@ -87,27 +90,15 @@
      }
    },
    "flake-utils_2": {
-      "locked": {
+      "inputs": {
-        "lastModified": 1667395993,
+        "systems": "systems_2"
        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "flake-utils_3": {
      "locked": {
-        "lastModified": 1667395993,
+        "lastModified": 1685518550,
-        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
+        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
+        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
        "type": "github"
      },
      "original": {
@ -139,48 +130,31 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1670525689,
+        "lastModified": 1685860998,
-        "narHash": "sha256-YIjGzxrRQa5LYO0zlnH/ardcwXsRgsnHe3TkGkvCxbc=",
+        "narHash": "sha256-ZexAPe8yvJaLvn5aVgjW0vY41RnmJGbgOdGBJk1yDIE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "f21f11aa2a02cb78651c6d57546c7d7541f9240c",
+        "rev": "45d47b647d7bbaede5121d731cbee78f6093b6d6",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixpkgs-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs-stable": {
      "locked": {
-        "lastModified": 1668984258,
+        "lastModified": 1678872516,
-        "narHash": "sha256-0gDMJ2T3qf58xgcSbYoXiRGUkPWmKyr5C3vcathWhKs=",
+        "narHash": "sha256-/E1YwtMtFAu2KUQKV/1+KFuReYPANM2Rzehk84VxVoc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "cf63ade6f74bbc9d2a017290f1b2e33e8fbfa70a",
+        "rev": "9b8e5abb18324c7fe9f07cb100c3cd4a29cda8b8",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-22.05",
+        "ref": "nixos-22.11",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1668994630,
        "narHash": "sha256-1lqx6HLyw6fMNX/hXrrETG1vMvZRGm2XVC9O/Jt0T6c=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "af50806f7c6ab40df3e6b239099e8f8385f6c78b",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
@ -188,17 +162,21 @@
    "pre-commit-hooks": {
      "inputs": {
        "flake-compat": "flake-compat_2",
-        "flake-utils": "flake-utils_3",
+        "flake-utils": [
          "flake-utils"
        ],
        "gitignore": "gitignore",
-        "nixpkgs": "nixpkgs_2",
+        "nixpkgs": [
          "nixpkgs"
        ],
        "nixpkgs-stable": "nixpkgs-stable"
      },
      "locked": {
-        "lastModified": 1670413394,
+        "lastModified": 1685361114,
-        "narHash": "sha256-M7sWqrKtOqUv9euX1t3HCxis8cPy9MNiZxQmUf0KF1o=",
+        "narHash": "sha256-4RjrlSb+OO+e1nzTExKW58o3WRwVGpXwj97iCta8aj4=",
        "owner": "cachix",
        "repo": "pre-commit-hooks.nix",
-        "rev": "1303a1a76e9eb074075bfe566518c413f6fc104e",
+        "rev": "ca2fdbf3edda2a38140184da6381d49f8206eaf4",
        "type": "github"
      },
      "original": {
@ -229,11 +207,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1670034122,
+        "lastModified": 1683080331,
-        "narHash": "sha256-EqmuOKucPWtMvCZtHraHr3Q3bgVszq1x2PoZtQkUuEk=",
+        "narHash": "sha256-nGDvJ1DAxZIwdn6ww8IFwzoHb2rqBP4wv/65Wt5vflk=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "a0d5773275ecd4f141d792d3a0376277c0fc0b65",
+        "rev": "d59c3fa0cba8336e115b376c2d9e91053aa59e56",
        "type": "github"
      },
      "original": {
@ -252,11 +230,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1670552927,
+        "lastModified": 1685846256,
-        "narHash": "sha256-lCE51eAGrAFS4k9W5aDGFpVtOAwQQ/rFMN80PCDh0vo=",
+        "narHash": "sha256-G4aYK4VqlMHImvZ0lUnLHw1A+Cx28T0sBMvAKZBcGpk=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "a0fdafd18c9cf599fde17fbaf07dbb20fa57eecb",
+        "rev": "1ef3c6de6127a1cba94cc5492cdde52e33d06ea4",
        "type": "github"
      },
      "original": {
@ -264,6 +242,36 @@
        "repo": "rust-overlay",
        "type": "github"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    },
    "systems_2": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@ -3,7 +3,7 @@
    "ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.";
  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
+    nixpkgs.url = "github:NixOS/nixpkgs";
    crane = {
      url = "github:ipetkov/crane";
@ -25,7 +25,13 @@
      flake = false;
    };
-    pre-commit-hooks.url = "github:cachix/pre-commit-hooks.nix";
+    pre-commit-hooks = {
      url = "github:cachix/pre-commit-hooks.nix";
      inputs = {
        nixpkgs.follows = "nixpkgs";
        flake-utils.follows = "flake-utils";
      };
    };
  };
  outputs = { self, nixpkgs, crane, flake-utils, rust-overlay, advisory-db
@ -36,14 +42,16 @@
          inherit system;
          overlays = [ (import rust-overlay) ];
        };
        inherit (pkgs) lib;
        craneLib = crane.lib.${system};
-        src = craneLib.cleanCargoSource ./.;
+        src = pkgs.lib.cleanSourceWith {
          src = craneLib.path ./.;
          filter = pkgs.lib.cleanSourceFilter;
        };
        buildInputs = with pkgs;
          [ ffmpeg imagemagick pandoc poppler_utils ripgrep tesseract ]
-          ++ lib.optionals pkgs.stdenv.isDarwin [
+          ++ pkgs.lib.optionals pkgs.stdenv.isDarwin [
            # Additional darwin specific inputs can be set here
            pkgs.libiconv
          ];
@ -54,10 +62,7 @@
        # Build the actual crate itself, reusing the dependency
        # artifacts from above.
-        rga = craneLib.buildPackage {
+        rga = craneLib.buildPackage { inherit cargoArtifacts src buildInputs; };
          inherit cargoArtifacts src buildInputs;
          doCheck = false;
        };
        pre-commit = pre-commit-hooks.lib."${system}".run;
      in {
@ -97,18 +102,20 @@
            hooks = {
              nixfmt.enable = true;
              rustfmt.enable = true;
-              cargo-check.enable = true;
+              typos = {
                enable = true;
                types = [ "text" ];
                excludes = [ "exampledir/.*" ];
              };
            };
          };
        } // lib.optionalAttrs (system == "x86_64-linux") {
          # NB: cargo-tarpaulin only supports x86_64 systems
          # Check code coverage (note: this will not upload coverage anywhere)
          rga-coverage =
            craneLib.cargoTarpaulin { inherit cargoArtifacts src; };
        };
        # `nix build`
-        packages.default = rga;
+        packages = {
          inherit rga; # `nix build .#rga`
          default = rga; # `nix build`
        };
        # `nix run`
        apps.default = flake-utils.lib.mkApp { drv = rga; };
--- a/src/adapters.rs
+++ b/src/adapters.rs
@ -10,6 +10,7 @@ pub mod writing;
 pub mod zip;
 use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
 use anyhow::{format_err, Context, Result};
 use async_trait::async_trait;
 use custom::CustomAdapterConfig;
 use custom::BUILTIN_SPAWNING_ADAPTERS;
 use log::*;
@ -77,11 +78,17 @@ impl AdapterMeta {
 pub trait GetMetadata {
    fn metadata(&self) -> &AdapterMeta;
 }
 #[async_trait]
 pub trait FileAdapter: GetMetadata + Send + Sync {
    /// adapt a file.
    ///
    /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
-    fn adapt(&self, a: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox>;
+    async fn adapt(
        &self,
        a: AdaptInfo,
        detection_reason: &FileMatcher,
    ) -> Result<AdaptedFilesIterBox>;
 }
 pub struct AdaptInfo {
--- a/src/adapters/custom.rs
+++ b/src/adapters/custom.rs
@ -49,8 +49,9 @@ pub struct CustomAdapterConfig {
    pub args: Vec<String>,
    /// The output path hint. The placeholders are the same as for `.args`
    ///
-    /// If not set, defaults to ${input_virtual_path}.txt
+    /// If not set, defaults to "${input_virtual_path}.txt"
    ///
    /// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter
    pub output_path_hint: Option<String>,
 }
@ -128,7 +129,6 @@ lazy_static! {
            disabled_by_default: None,
            match_only_by_mime: None,
            output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into())
            // postprocessors: [{name: "add_page_numbers_by_pagebreaks"}]
        }
    ];
 }
@ -143,15 +143,13 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow:
    }
 }
-fn proc_wait(mut child: Child) -> impl AsyncRead {
+fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead {
    let s = stream! {
        let res = child.wait().await?;
        if res.success() {
            yield std::io::Result::Ok(Bytes::new());
        } else {
-            yield std::io::Result::Err(to_io_err(
+            Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?;
                format_err!("subprocess failed: {:?}", res),
            ));
        }
    };
    StreamReader::new(s)
@ -164,6 +162,7 @@ pub fn pipe_output(
    exe_name: &str,
    help: &str,
 ) -> Result<ReadBox> {
    let cmd_log = format!("{:?}", cmd); // todo: perf
    let mut cmd = cmd
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
@ -177,10 +176,9 @@ pub fn pipe_output(
        tokio::io::copy(&mut z, &mut stdi).await?;
        std::io::Result::Ok(())
    });
-
+    Ok(Box::pin(stdo.chain(
-    Ok(Box::pin(
+        proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)),
-        stdo.chain(proc_wait(cmd).chain(join_handle_to_stream(join))),
+    )))
    ))
 }
 pub struct CustomSpawningFileAdapter {
@ -224,8 +222,9 @@ impl CustomSpawningFileAdapter {
        Ok(command)
    }
 }
 #[async_trait]
 impl FileAdapter for CustomSpawningFileAdapter {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        ai: AdaptInfo,
        _detection_reason: &FileMatcher,
@ -314,7 +313,7 @@ mod test {
        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        // let r = adapter.adapt(a, &d)?;
-        let r = loop_adapt(&adapter, d, a)?;
+        let r = loop_adapt(&adapter, d, a).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
@ -368,7 +367,7 @@ PREFIX:Page 1:
            Path::new("foo.txt"),
            Box::pin(Cursor::new(Vec::from(input))),
        );
-        let output = adapter.adapt(a, &d).unwrap();
+        let output = adapter.adapt(a, &d).await.unwrap();
        let oup = adapted_to_vec(output).await?;
        println!("output: {}", String::from_utf8_lossy(&oup));
--- a/src/adapters/decompress.rs
+++ b/src/adapters/decompress.rs
@ -93,8 +93,13 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
    filename.with_file_name(format!("{}{}", stem, new_extension))
 }
 #[async_trait]
 impl FileAdapter for DecompressAdapter {
-    fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
        &self,
        ai: AdaptInfo,
        detection_reason: &FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        Ok(one_file(AdaptInfo {
            filepath_hint: get_inner_filename(&ai.filepath_hint),
            is_real_file: false,
@ -137,7 +142,7 @@ mod tests {
        let filepath = test_data_dir().join("hello.gz");
        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
-        let r = adapter.adapt(a, &d)?;
+        let r = adapter.adapt(a, &d).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(String::from_utf8(o)?, "hello\n");
        Ok(())
@ -150,7 +155,7 @@ mod tests {
        let filepath = test_data_dir().join("short.pdf.gz");
        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
-        let r = loop_adapt(&adapter, d, a)?;
+        let r = loop_adapt(&adapter, d, a).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
--- a/src/adapters/ffmpeg.rs
+++ b/src/adapters/ffmpeg.rs
@ -14,13 +14,15 @@ use writing::WritingFileAdapter;
 // maybe todo: read list of extensions from
 // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
 // but really, the probability of getting useful information from a .flv is low
-static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
+static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"];
 lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "ffmpeg".to_owned(),
        version: 1,
-        description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
+        description:
            "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata"
                .to_owned(),
        recurses: false,
        fast_matchers: EXTENSIONS
            .iter()
@ -52,7 +54,7 @@ struct FFprobeOutput {
 }
 #[derive(Serialize, Deserialize)]
 struct FFprobeStream {
-    codec_type: String, // video,audio,subtitle
+    index: i32, // stream index
 }
 #[async_trait]
@ -78,17 +80,17 @@ impl WritingFileAdapter for FFmpegAdapter {
        }
        let inp_fname = filepath_hint;
        let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed.");
-        let has_subtitles = {
+        let subtitle_streams = {
            let probe = Command::new("ffprobe")
                .args(vec![
                    "-v",
-                    "error",
+                    "error", // show all errors
                    "-select_streams",
-                    "s",
+                    "s", // show only subtitle streams
                    "-of",
-                    "json",
+                    "json", // use json as output format
                    "-show_entries",
-                    "stream=codec_type",
+                    "stream=index", // show index of subtitle streams
                ])
                .arg("-i")
                .arg(&inp_fname)
@ -96,10 +98,14 @@ impl WritingFileAdapter for FFmpegAdapter {
                .await
                .map_err(spawn_fail)?;
            if !probe.status.success() {
-                return Err(format_err!("ffprobe failed: {:?}", probe.status));
+                return Err(format_err!(
                    "ffprobe failed: {:?}\n{}",
                    probe.status,
                    String::from_utf8_lossy(&probe.stderr)
                ));
            }
            let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?;
-            !p.streams.is_empty()
+            p.streams
        };
        {
            // extract file metadata (especially chapter names in a greppable format)
@ -124,6 +130,7 @@ impl WritingFileAdapter for FFmpegAdapter {
                .spawn()?;
            let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines();
            while let Some(line) = lines.next_line().await? {
                let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines
                async_writeln!(oup, "metadata: {line}")?;
            }
            let exit = probe.wait().await?;
@ -131,7 +138,8 @@ impl WritingFileAdapter for FFmpegAdapter {
                return Err(format_err!("ffprobe failed: {:?}", exit));
            }
        }
-        if has_subtitles {
+        if !subtitle_streams.is_empty() {
            for probe_stream in subtitle_streams.iter() {
                // extract subtitles
                let mut cmd = Command::new("ffmpeg");
                cmd.arg("-hide_banner")
@ -139,6 +147,8 @@ impl WritingFileAdapter for FFmpegAdapter {
                    .arg("panic")
                    .arg("-i")
                    .arg(&inp_fname)
                    .arg("-map")
                    .arg(format!("0:{}", probe_stream.index)) // 0 for first input
                    .arg("-f")
                    .arg("webvtt")
                    .arg("-");
@ -159,6 +169,7 @@ impl WritingFileAdapter for FFmpegAdapter {
                    }
                }
            }
        }
        Ok(())
    }
 }
--- a/src/adapters/postproc.rs
+++ b/src/adapters/postproc.rs
@ -4,7 +4,11 @@
 use anyhow::Result;
 use async_stream::stream;
 use async_trait::async_trait;
 use bytes::Bytes;
 use encoding_rs::Encoding;
 use encoding_rs_io::DecodeReaderBytesBuilder;
 use tokio_util::io::SyncIoBridge;
 use std::io::Cursor;
 use std::path::PathBuf;
@ -41,15 +45,16 @@ impl GetMetadata for PostprocPrefix {
        &METADATA
    }
 }
 #[async_trait]
 impl FileAdapter for PostprocPrefix {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        let read = add_newline(postproc_prefix(
            &a.line_prefix,
-            postproc_encoding(&a.line_prefix, a.inp)?,
+            postproc_encoding(&a.line_prefix, a.inp).await?,
        ));
        // keep adapt info (filename etc) except replace inp
        let ai = AdaptInfo {
@ -74,50 +79,53 @@ impl Read for ReadErr {
 * Detects and converts encodings other than utf-8 to utf-8.
 * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
 */
-pub fn postproc_encoding(
+async fn postproc_encoding(
    _line_prefix: &str,
-    inp: impl AsyncRead + Send + 'static,
+    inp: Pin<Box<dyn AsyncRead + Send>>,
 ) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
-    Ok(Box::pin(inp))
+    // check for binary content in first 8kB
-    // panic!("todo: implement");
+    // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
-    /*// TODO: parse these options from ripgrep's configuration
+    let mut fourk = Vec::with_capacity(1 << 13);
    let mut beginning = inp.take(1 << 13);
    beginning.read_to_end(&mut fourk).await?;
    let has_binary = fourk.contains(&0u8);
    let enc = Encoding::for_bom(&fourk);
    let inp = Cursor::new(fourk).chain(beginning.into_inner());
    match enc {
        Some((enc, _)) if enc != encoding_rs::UTF_8 => {
            // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
            // TODO: parse these options from ripgrep's configuration
            let encoding = None; // detect bom but usually assume utf8
            let bom_sniffing = true;
            let mut decode_builder = DecodeReaderBytesBuilder::new();
            // https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
            // this detects utf-16 BOMs and transcodes to utf-8 if they are present
            // it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
-    let inp = decode_builder
+            let mut inp = decode_builder
                .encoding(encoding)
                .utf8_passthru(true)
                .strip_bom(bom_sniffing)
                .bom_override(true)
                .bom_sniffing(bom_sniffing)
-        .build(inp);
+                .build(SyncIoBridge::new(inp));
-
+            let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
-    // check for binary content in first 8kB
+                let mut oup = Vec::new();
-    // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
+                std::io::Read::read_to_end(&mut inp, &mut oup)?;
-    let mut fourk = Vec::with_capacity(1 << 13);
+                Ok(oup)
-    let mut beginning = inp.take(1 << 13);
+            })
-
+            .await??;
-    beginning.read_to_end(&mut fourk)?;
+            Ok(Box::pin(Cursor::new(oup)))
-
+        }
-    if fourk.contains(&0u8) {
+        _ => {
-        log::debug!("detected binary");
+            if has_binary {
-        let v = "[rga: binary data]";
+                log::debug!("detected binary");
-        return Ok(Box::new(std::io::Cursor::new(v)));
+                return Ok(Box::pin(Cursor::new("[rga: binary data]")));
-        /*let err = std::io::Error::new(
+            }
-            std::io::ErrorKind::InvalidData,
+            Ok(Box::pin(inp))
-            format!("{}[rga: binary data]", line_prefix),
+        }
        );
        return Err(err).context("");
        return ReadErr {
            err,
        };*/
    }
    Ok(Box::new(
        std::io::Cursor::new(fourk).chain(beginning.into_inner()),
    ))*/
 }
 /// Adds the given prefix to each line in an `AsyncRead`.
@ -164,13 +172,14 @@ impl GetMetadata for PostprocPageBreaks {
        &METADATA
    }
 }
 #[async_trait]
 impl FileAdapter for PostprocPageBreaks {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
-        let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
+        let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?);
        // keep adapt info (filename etc) except replace inp
        let ai = AdaptInfo {
            inp: Box::pin(read),
@ -282,7 +291,7 @@ mod tests {
        let fname = test_data_dir().join("twoblankpages.pdf");
        let rd = File::open(&fname).await?;
        let (a, d) = simple_adapt_info(&fname, Box::pin(rd));
-        let res = loop_adapt(&adapter, d, a)?;
+        let res = loop_adapt(&adapter, d, a).await?;
        let buf = adapted_to_vec(res).await?;
@ -327,7 +336,8 @@ PREFIX:Page 3:
        b: &str,
    ) -> Result<()> {
        let mut oup = Vec::new();
-        let inp = postproc_encoding("", a)?;
+        let inp = Box::pin(Cursor::new(a));
        let inp = postproc_encoding("", inp).await?;
        if pagebreaks {
            postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
        } else {
@ -341,6 +351,23 @@ PREFIX:Page 3:
        Ok(())
    }
    /// Feeds the same text encoded as UTF-16LE and as UTF-16BE (each starting with a
    /// byte order mark) to `test_from_bytes`, expecting the UTF-8 text "hello world 💩\n".
    #[tokio::test]
    async fn test_utf16() -> Result<()> {
        // starts with the 0xff 0xfe little-endian byte order mark;
        // 0x3d 0xd8 0xa9 0xdc is the surrogate pair for 💩, followed by "\n"
        let utf16lebom: &[u8] = &[
            0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00,
            0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8,
            0xa9, 0xdc, 0x0a, 0x00,
        ];
        // same text with every 16-bit unit byte-swapped, starting with the 0xfe 0xff big-endian byte order mark
        let utf16bebom: &[u8] = &[
            0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20,
            0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d,
            0xdc, 0xa9, 0x00, 0x0a,
        ];
        // NOTE(review): the first two arguments are presumably the pagebreaks flag and the line prefix — confirm against test_from_bytes
        test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?;
        test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?;
        Ok(())
    }
    #[tokio::test]
    async fn post1() -> Result<()> {
        let inp = "What is this\nThis is a test\nFoo";
@ -362,8 +389,7 @@ PREFIX:Page 3:
        Ok(())
    }
-    /*
+
    todo: uncomment when fixed
    #[tokio::test]
    async fn test_binary_content() -> Result<()> {
        test_from_strs(
@ -375,7 +401,7 @@ PREFIX:Page 3:
        .await?;
        test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
        Ok(())
-     }*/
+    }
    /*#[test]
    fn chardet() -> Result<()> {
--- a/src/adapters/sqlite.rs
+++ b/src/adapters/sqlite.rs
@ -77,11 +77,13 @@ fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> {
        return Ok(());
    }
    let inp_fname = filepath_hint;
-
+    let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)
-    let conn = Connection::open_with_flags(inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
+        .with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?;
    let tables: Vec<String> = conn
-        .prepare("select name from sqlite_master where type='table'")?
+        .prepare("select name from sqlite_master where type='table'")
-        .query_map([], |r| r.get::<_, String>(0))?
+        .context("while preparing query")?
        .query_map([], |r| r.get::<_, String>(0))
        .context("while executing query")?
        .filter_map(|e| e.ok())
        .collect();
    debug!("db has {} tables", tables.len());
@ -121,7 +123,9 @@ impl WritingFileAdapter for SqliteAdapter {
        oup: Pin<Box<dyn AsyncWrite + Send>>,
    ) -> Result<()> {
        let oup_sync = SyncIoBridge::new(oup);
-        tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)).await??;
+        tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync))
            .await?
            .context("in synchronous sqlite task")?;
        Ok(())
    }
 }
@ -134,10 +138,10 @@ mod test {
    #[tokio::test]
    async fn simple() -> Result<()> {
-        let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default());
+        let adapter: Box<dyn FileAdapter> = Box::<SqliteAdapter>::default();
        let fname = test_data_dir().join("hello.sqlite3");
        let (a, d) = simple_fs_adapt_info(&fname).await?;
-        let res = adapter.adapt(a, &d)?;
+        let res = adapter.adapt(a, &d).await?;
        let buf = adapted_to_vec(res).await?;
--- a/src/adapters/tar.rs
+++ b/src/adapters/tar.rs
@ -6,6 +6,7 @@ use crate::{
 };
 use anyhow::*;
 use async_stream::stream;
 use async_trait::async_trait;
 use lazy_static::lazy_static;
 use log::*;
 use std::path::PathBuf;
@ -45,8 +46,13 @@ impl GetMetadata for TarAdapter {
    }
 }
 #[async_trait]
 impl FileAdapter for TarAdapter {
-    fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
        &self,
        ai: AdaptInfo,
        _detection_reason: &FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        let AdaptInfo {
            filepath_hint,
            inp,
@ -103,7 +109,7 @@ mod tests {
        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        let adapter = TarAdapter::new();
-        let r = loop_adapt(&adapter, d, a).context("adapt")?;
+        let r = loop_adapt(&adapter, d, a).await.context("adapt")?;
        let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
        assert_eq!(
            String::from_utf8(o).context("parsing utf8")?,
--- a/src/adapters/writing.rs
+++ b/src/adapters/writing.rs
@ -3,7 +3,7 @@ use std::pin::Pin;
 use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err};
 use super::{AdaptInfo, FileAdapter, GetMetadata};
-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_trait::async_trait;
 use tokio::io::{AsyncReadExt, AsyncWrite};
@ -41,15 +41,17 @@ macro_rules! async_writeln {
 }
 pub(crate) use async_writeln;
 #[async_trait]
 impl<T> FileAdapter for T
 where
    T: WritingFileAdapter,
 {
-    fn adapt(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        detection_reason: &crate::matching::FileMatcher,
    ) -> Result<crate::adapted_iter::AdaptedFilesIterBox> {
        let name = self.metadata().name.clone();
        let (w, r) = tokio::io::duplex(128 * 1024);
        let d2 = detection_reason.clone();
        let archive_recursion_depth = a.archive_recursion_depth + 1;
@ -59,7 +61,10 @@ where
        let config = a.config.clone();
        let joiner = tokio::spawn(async move {
            let x = d2;
-            T::adapt_write(a, &x, Box::pin(w)).await.map_err(to_io_err)
+            T::adapt_write(a, &x, Box::pin(w))
                .await
                .with_context(|| format!("in {}.adapt_write", name))
                .map_err(to_io_err)
        });
        Ok(one_file(AdaptInfo {
--- a/src/adapters/zip.rs
+++ b/src/adapters/zip.rs
@ -5,7 +5,7 @@ use async_stream::stream;
 use lazy_static::lazy_static;
 use log::*;
-static EXTENSIONS: &[&str] = &["zip"];
+static EXTENSIONS: &[&str] = &["zip", "jar"];
 lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
@ -36,8 +36,13 @@ impl GetMetadata for ZipAdapter {
    }
 }
 #[async_trait]
 impl FileAdapter for ZipAdapter {
-    fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
        &self,
        ai: AdaptInfo,
        _detection_reason: &FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        // let (s, r) = mpsc::channel(1);
        let AdaptInfo {
            inp,
@ -52,11 +57,11 @@ impl FileAdapter for ZipAdapter {
        if is_real_file {
            use async_zip::read::fs::ZipFileReader;
            let s = stream! {
            let zip = ZipFileReader::new(&filepath_hint).await?;
-                for i in 0..zip.entries().len() {
+            let s = stream! {
-                    let reader = zip.entry_reader(i).await?;
+                for i in 0..zip.file().entries().len() {
-                    let file = reader.entry();
+                    let file = zip.get_entry(i)?;
                    let reader = zip.entry(i).await?;
                    if file.filename().ends_with('/') {
                        continue;
                    }
@ -98,10 +103,11 @@ impl FileAdapter for ZipAdapter {
            let mut zip = ZipFileReader::new(inp);
            let s = stream! {
-                    while !zip.finished() {
+                    while let Some(mut entry) = zip.next_entry().await? {
-                    if let Some(reader) = zip.entry_reader().await? {
+                        let file = entry.entry();
                        let file = reader.entry();
                        if file.filename().ends_with('/') {
                            zip = entry.skip().await?;
                            continue;
                        }
                        debug!(
@ -114,6 +120,7 @@ impl FileAdapter for ZipAdapter {
                        );
                        let new_line_prefix = format!("{}{}: ", line_prefix, file.filename());
                        let fname = PathBuf::from(file.filename());
                        let reader = entry.reader();
                        tokio::pin!(reader);
                        // SAFETY: this should be solvable without unsafe but idk how :(
                        // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream
@ -133,7 +140,8 @@ impl FileAdapter for ZipAdapter {
                            postprocess,
                            config: config.clone(),
                        });
-                    }
+                        zip = entry.done().await.context("going to next file in zip but entry was not read fully")?;
                }
            };
@ -183,7 +191,6 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
 mod test {
    use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder};
    use super::*;
    use crate::{preproc::loop_adapt, test_utils::*};
    use pretty_assertions::assert_eq;
@ -213,7 +220,7 @@ mod test {
    async fn only_seek_zip_fs() -> Result<()> {
        let zip = test_data_dir().join("only-seek-zip.zip");
        let (a, d) = simple_fs_adapt_info(&zip).await?;
-        let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?;
+        let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?;
        // assert_eq!(String::from_utf8(v)?, "");
        Ok(())
@ -236,7 +243,7 @@ mod test {
            &PathBuf::from("outer.zip"),
            Box::pin(std::io::Cursor::new(zipfile)),
        );
-        let buf = adapted_to_vec(loop_adapt(&adapter, d, a)?).await?;
+        let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?;
        assert_eq!(
            String::from_utf8(buf)?,
--- a/src/bin/rga-preproc.rs
+++ b/src/bin/rga-preproc.rs
@ -43,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
            // happens if e.g. ripgrep detects binary data in the pipe so it cancels reading
            debug!("output cancelled (broken pipe)");
        } else {
-            Err(e).context("copying adapter output to stdout {}")?;
+            Err(e).context("copying adapter output to stdout")?;
        }
    }
    debug!("running adapter took {} total", print_dur(start));
--- a/src/caching_writer.rs
+++ b/src/caching_writer.rs
@ -1,17 +1,17 @@
-use std::pin::Pin;
+use std::{future::Future, pin::Pin};
-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_compression::tokio::write::ZstdEncoder;
 use async_stream::stream;
 use crate::to_io_err;
 use log::*;
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tokio_stream::StreamExt;
 use tokio_util::io::{ReaderStream, StreamReader};
-use crate::to_io_err;
+type FinishHandler =
-
+    dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send;
 type FinishHandler = dyn FnOnce((u64, Option<Vec<u8>>)) -> Result<()> + Send;
 /**
 * wrap an AsyncRead so that it is passthrough,
 * but also the written data is compressed and written into a buffer,
@ -26,7 +26,7 @@ pub fn async_read_and_write_to_cache<'a>(
    let inp = Box::pin(inp);
    let mut zstd_writer = Some(ZstdEncoder::with_quality(
        Vec::new(),
-        async_compression::Level::Precise(compression_level as u32),
+        async_compression::Level::Precise(compression_level),
    ));
    let mut bytes_written = 0;
@ -64,7 +64,7 @@ pub fn async_read_and_write_to_cache<'a>(
        };
        // EOF, finish!
-        on_finish(finish)
+        on_finish(finish).await.context("write_to_cache on_finish")
            .map_err(to_io_err)?;
    };
--- a/src/config.rs
+++ b/src/config.rs
@ -108,6 +108,7 @@ impl FromStr for CacheMaxBlobLen {
    rename_all = "kebab-case",
    about = env!("CARGO_PKG_DESCRIPTION"),
    author = env!("CARGO_PKG_HOMEPAGE"),
    long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.",
    // TODO: long_about does not seem to work to only show this on short help
    after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]",
    usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]"
@ -197,9 +198,9 @@ pub struct CacheConfig {
    /// Disable caching of results
    ///
    /// By default, rga caches the extracted text, if it is small enough,
-    /// to a database in ~/.cache/rga on Linux,
+    /// to a database in ${XDG_CACHE_HOME-~/.cache}/ripgrep-all on Linux,
-    /// ~/Library/Caches/rga on macOS,
+    /// ~/Library/Caches/ripgrep-all on macOS,
-    /// or C:\Users\username\AppData\Local\rga on Windows.
+    /// or C:\Users\username\AppData\Local\ripgrep-all on Windows.
    /// This way, repeated searches on the same set of files will be much faster.
    /// If you pass this flag, all caching will be disabled.
    #[serde(default, skip_serializing_if = "is_default")]
@ -208,7 +209,9 @@ pub struct CacheConfig {
    /// Max compressed size to cache
    ///
-    /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G
+    /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time.
    ///
    /// Allowed suffixes on command line: k M G
    #[serde(default, skip_serializing_if = "is_default")]
    #[structopt(
        default_value,
--- a/src/preproc.rs
+++ b/src/preproc.rs
@ -3,25 +3,28 @@ use crate::adapters::*;
 use crate::caching_writer::async_read_and_write_to_cache;
 use crate::config::RgaConfig;
 use crate::matching::*;
 use crate::preproc_cache::CacheKey;
 use crate::recurse::concat_read_streams;
 use crate::{
-    preproc_cache::{LmdbCache, PreprocCache},
+    preproc_cache::{open_cache_db, PreprocCache},
    print_bytes,
 };
 use anyhow::*;
 use async_compression::tokio::bufread::ZstdDecoder;
 use async_stream::stream;
 // use futures::future::{BoxFuture, FutureExt};
 use log::*;
 use path_clean::PathClean;
 use postproc::PostprocPrefix;
 use std::future::Future;
 use std::io::Cursor;
 use std::path::Path;
 use std::pin::Pin;
 use std::sync::Arc;
 use tokio::io::AsyncBufRead;
 use tokio::io::AsyncBufReadExt;
 use tokio::io::BufReader;
-type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
+pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
 async fn choose_adapter(
    config: &RgaConfig,
@ -120,36 +123,6 @@ pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
        .with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
 }
 /// Builds the serialized cache key for the file at `filepath_hint` as handled by `adapter`.
 ///
 /// The key is a bincode-serialized tuple that always contains the cleaned path and the
 /// file's modification time, plus the adapter identity:
 /// - if `adapter.metadata().recurses` is set: the `(name, version)` of every entry in `active_adapters`
 /// - otherwise: only the name and version of `adapter` itself
 ///
 /// # Errors
 /// Fails if the file metadata cannot be read or if the key cannot be serialized.
 ///
 /// # Panics
 /// Panics if no modification time is available for the file.
 fn compute_cache_key(
    filepath_hint: &Path,
    adapter: &dyn FileAdapter,
    active_adapters: ActiveAdapters,
 ) -> Result<Vec<u8>> {
    // cleaned via path_clean's `PathClean::clean` (presumably a purely lexical normalization — confirm)
    let clean_path = filepath_hint.to_owned().clean();
    // note: the metadata is read from the original `filepath_hint`, not from `clean_path`
    let meta = std::fs::metadata(filepath_hint)
        .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
    let modified = meta.modified().expect("weird OS that can't into mtime");
    // NOTE(review): presumably the output of a recursing adapter depends on which other
    // adapters are active, which is why the whole adapter list goes into the key — confirm
    if adapter.metadata().recurses {
        let active_adapters_cache_key = active_adapters
            .iter()
            .map(|a| (a.metadata().name.clone(), a.metadata().version))
            .collect::<Vec<_>>();
        let key = (active_adapters_cache_key, clean_path, modified);
        debug!("Cache key (with recursion): {:?}", key);
        bincode::serialize(&key).context("could not serialize path")
    } else {
        // non-recursing adapter: only this adapter's own name and version identify the output
        let key = (
            adapter.metadata().name.clone(),
            adapter.metadata().version,
            clean_path,
            modified,
        );
        debug!("Cache key (no recursion): {:?}", key);
        bincode::serialize(&key).context("could not serialize path")
    }
 }
 async fn adapt_caching(
    ai: AdaptInfo,
    adapter: Arc<dyn FileAdapter>,
@ -166,41 +139,44 @@ async fn adapt_caching(
        ai.filepath_hint.to_string_lossy(),
        &meta.name
    );
    let db_name = format!("{}.v{}", meta.name, meta.version);
    let cache_compression_level = ai.config.cache.compression_level;
    let cache_max_blob_len = ai.config.cache.max_blob_len;
-    let cache = if ai.is_real_file {
+    let cache = if ai.is_real_file && !ai.config.cache.disabled {
-        LmdbCache::open(&ai.config.cache)?
+        Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
    } else {
        None
    };
    let mut cache = cache.context("No cache?")?;
-    let cache_key: Vec<u8> =
+    let cache_key = CacheKey::new(&ai.filepath_hint, adapter.as_ref(), &active_adapters)?;
        compute_cache_key(&ai.filepath_hint, adapter.as_ref(), active_adapters)?;
    // let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
-    let cached = cache.get(&db_name, &cache_key)?;
+    let cached = cache.get(&cache_key).await.context("cache.get")?;
    match cached {
        Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
        None => {
            debug!("cache MISS, running adapter with caching...");
-            let inp = loop_adapt(adapter.as_ref(), detection_reason, ai)?;
+            let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
            let inp = concat_read_streams(inp);
            let inp = async_read_and_write_to_cache(
                inp,
                cache_max_blob_len.0,
                cache_compression_level.0,
                Box::new(move |(uncompressed_size, compressed)| {
                    Box::pin(async move {
                        debug!(
                            "uncompressed output: {}",
                            print_bytes(uncompressed_size as f64)
                        );
                        if let Some(cached) = compressed {
                            debug!("compressed output: {}", print_bytes(cached.len() as f64));
-                        cache.set(&db_name, &cache_key, &cached)?
+                            cache
                                .set(&cache_key, cached)
                                .await
                                .context("writing to cache")?
                        }
                        Ok(())
                    })
                }),
            )?;
@ -213,21 +189,34 @@ pub fn loop_adapt(
    adapter: &dyn FileAdapter,
    detection_reason: FileMatcher,
    ai: AdaptInfo,
 ) -> Pin<Box<dyn Future<Output = anyhow::Result<AdaptedFilesIterBox>> + Send + '_>> {
    Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
 }
 pub async fn loop_adapt_inner(
    adapter: &dyn FileAdapter,
    detection_reason: FileMatcher,
    ai: AdaptInfo,
 ) -> anyhow::Result<AdaptedFilesIterBox> {
    let fph = ai.filepath_hint.clone();
-    let inp = adapter.adapt(ai, &detection_reason).with_context(|| {
+    let inp = adapter.adapt(ai, &detection_reason).await;
    let inp = if adapter.metadata().name == "postprocprefix" {
        // don't add confusing error context
        inp?
    } else {
        inp.with_context(|| {
            format!(
                "adapting {} via {} failed",
                fph.to_string_lossy(),
                adapter.metadata().name
            )
-    })?;
+        })?
    };
    let s = stream! {
        for await file in inp {
            match buf_choose_adapter(file?).await? {
                Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
                    if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
-                        let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
+                        let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
                        yield Ok(AdaptInfo {
                            inp: Box::pin(Cursor::new(s)),
                            ..ai
@ -243,7 +232,7 @@ pub fn loop_adapt(
                        ai.filepath_hint.to_string_lossy(),
                        &adapter.metadata().name
                    );
-                    for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai)? {
+                    for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
                        yield ifile;
                    }
                }
--- a/src/preproc_cache.rs
+++ b/src/preproc_cache.rs
@ -1,135 +1,188 @@
-use crate::{config::CacheConfig, print_bytes, print_dur};
+use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
-use anyhow::{format_err, Context, Result};
+use anyhow::{Context, Result};
-use log::*;
+use path_clean::PathClean;
-use rkv::backend::{BackendEnvironmentBuilder, LmdbEnvironment};
+use rusqlite::{named_params, OptionalExtension};
-use std::{fmt::Display, path::Path, time::Instant};
+use std::{path::Path, time::UNIX_EPOCH};
 use tokio_rusqlite::Connection;
-pub trait PreprocCache: Send + Sync {
+#[derive(Clone)]
-    /*/// gets cache at specified key.
+pub struct CacheKey {
-    /// if cache hit, return the resulting data
+    adapter: String,
-    /// else, run the given lambda, and store its result in the cache if present
+    adapter_version: i32,
-    fn get_or_run<'a>(
+    active_adapters: String,
-        &mut self,
+    file_path: String,
-        db_name: &str,
+    file_mtime_unix_ms: i64,
        key: &[u8],
        debug_name: &str,
        runner: Box<dyn FnOnce() -> Result<Option<Vec<u8>>> + 'a>,
    ) -> Result<Option<Vec<u8>>>;*/
    fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>>;
    fn set(&mut self, db_name: &str, key: &[u8], value: &[u8]) -> Result<()>;
 }
-
+impl CacheKey {
-/// opens a LMDB cache
+    pub fn new(
-fn open_cache_db(
+        filepath_hint: &Path,
-    path: &Path,
+        adapter: &dyn FileAdapter,
-) -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>> {
+        active_adapters: &ActiveAdapters,
-    std::fs::create_dir_all(path)?;
+    ) -> Result<CacheKey> {
-    // use rkv::backend::LmdbEnvironmentFlags;
+        let meta = std::fs::metadata(filepath_hint)
-
+            .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
-    rkv::Manager::<LmdbEnvironment>::singleton()
+        let modified = meta.modified().expect("weird OS that can't into mtime");
-        .write()
+        let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
-        .map_err(|_| format_err!("could not write cache db manager"))?
+        let active_adapters = if adapter.metadata().recurses {
-        .get_or_create(path, |p| {
+            serde_json::to_string(
-            let mut builder = rkv::Rkv::environment_builder::<rkv::backend::Lmdb>();
+                &active_adapters
-            builder
+                    .iter()
-                .set_flags(rkv::EnvironmentFlags::NO_SYNC)
+                    .map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
-                .set_flags(rkv::EnvironmentFlags::WRITE_MAP) // not durable cuz it's a cache
+                    .collect::<Vec<_>>(),
-                // i'm not sure why NO_TLS is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
+            )?
-                // LmdbError(ReadersFull). Those "open readers" stay even after the corresponding processes exit.
+        } else {
-                // hope setting this doesn't break integrity
+            "null".to_string()
-                .set_flags(rkv::EnvironmentFlags::NO_TLS)
+        };
-                // sometimes, this seems to cause the data.mdb file to appear as 2GB in size (with holes), but sometimes not?
+        Ok(CacheKey {
-                .set_map_size(2 * 1024 * 1024 * 1024)
+            adapter: adapter.metadata().name.clone(),
-                .set_max_dbs(100)
+            adapter_version: adapter.metadata().version,
-                .set_max_readers(128);
+            file_path: filepath_hint.clean().to_string_lossy().to_string(),
-            rkv::Rkv::from_builder(p, builder)
+            file_mtime_unix_ms,
            active_adapters,
        })
        .map_err(|e| format_err!("could not get/create cache db: {}", e))
 }
 /// Cache handle that wraps a shared, lock-guarded `rkv` LMDB environment.
 pub struct LmdbCache {
    // Shared handle to the LMDB environment; filled from `open_cache_db` in `LmdbCache::open`.
    db_arc: std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>,
 }
 impl LmdbCache {
    /// Opens the cache described by `config`.
    ///
    /// Returns `Ok(None)` when `config.disabled` is set. Otherwise the
    /// environment at `config.path.0` is opened through `open_cache_db` and
    /// wrapped in `Some(LmdbCache)`.
    ///
    /// # Errors
    /// Propagates any error returned by `open_cache_db`.
    pub fn open(config: &CacheConfig) -> Result<Option<LmdbCache>> {
        // A disabled cache is not an error: the caller simply gets no cache.
        if config.disabled {
            return Ok(None);
        }
        let path = Path::new(&config.path.0);
        Ok(Some(LmdbCache {
            db_arc: open_cache_db(path)?,
        }))
    }
 }
-#[derive(Debug)]
+#[async_trait::async_trait]
-struct RkvErrWrap(rkv::StoreError);
+pub trait PreprocCache {
-impl Display for RkvErrWrap {
+    async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
-        self.0.fmt(f)
+}
 /// Prepares a cache connection for use.
 ///
 /// Runs a single batch that sets the connection pragmas and creates the
 /// `preproc_cache` table and its unique index if they do not exist yet. It
 /// then reads the `application_id` pragma and calls `create_pragmas` when the
 /// value is not 924716026.
 ///
 /// The unique index covers (adapter, adapter_version, file_path,
 /// active_adapters); `file_mtime_unix_ms` is not part of it.
 ///
 /// # Errors
 /// Fails if the batch, the pragma query, or `create_pragmas` fails. Batch
 /// failures carry the context "connect_pragmas", `create_pragmas` failures
 /// the context "create_pragmas".
 async fn connect_pragmas(db: &Connection) -> Result<()> {
    // https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
    //let want_page_size = 32768;
    //db.execute(&format!("pragma page_size = {};", want_page_size))
    //    .context("setup pragma 1")?;
    db.call(|db| {
        db.execute_batch(
            "
    pragma journal_mode = WAL;
    pragma foreign_keys = on;
    pragma temp_store = memory;
    pragma synchronous = off; -- integrity isn't very important here
    pragma mmap_size = 30000000000;
    create table if not exists preproc_cache (
        adapter text not null,
        adapter_version integer not null,
        created_unix_ms integer not null default (unixepoch() * 1000),
        active_adapters text not null, -- 'null' if adapter cannot recurse
        file_path text not null,
        file_mtime_unix_ms integer not null,
        text_content_zstd blob not null
    ) strict;
    create unique index if not exists preproc_cache_idx on preproc_cache (adapter, adapter_version, file_path, active_adapters);
    ",
        )
    })
    .await.context("connect_pragmas")?;
    // NOTE: despite its name, `jm` holds the value of the `application_id` pragma.
    let jm: i64 = db
        .call(|db| db.pragma_query_value(None, "application_id", |r| r.get(0)))
        .await?;
    // 924716026 is the id that `create_pragmas` writes; any other value means
    // the database has not been stamped yet.
    if jm != 924716026 {
        // (probably) newly created db
        create_pragmas(db).await.context("create_pragmas")?;
    }
    Ok(())
 }
 /// Stamps the database by setting `application_id` to 924716026 and
 /// `user_version` to 2.
 ///
 /// `connect_pragmas` calls this when the stored `application_id` differs from
 /// that value.
 ///
 /// # Errors
 /// Propagates any failure of the pragma batch.
 async fn create_pragmas(db: &Connection) -> Result<()> {
    db.call(|db| {
        db.execute_batch(
            "
        pragma application_id = 924716026;
        pragma user_version = 2; -- todo: on upgrade clear db if version is unexpected
        ",
        )
    })
    .await?;
    Ok(())
 }
 /// Cache backed by a single `tokio_rusqlite` connection.
 struct SqliteCache {
    // Connection that every cache query is sent through via `Connection::call`.
    db: Connection,
 }
 impl SqliteCache {
    /// Opens the cache database stored in the directory `path`.
    ///
    /// `path` is treated as a directory: the database file is
    /// `path/cache.sqlite3`. After opening, `connect_pragmas` is run on the
    /// connection before the cache is returned.
    ///
    /// # Errors
    /// Fails if the connection cannot be opened or `connect_pragmas` fails.
    async fn new(path: &Path) -> Result<SqliteCache> {
        let db = Connection::open(path.join("cache.sqlite3")).await?;
        connect_pragmas(&db).await?;
        Ok(SqliteCache { db })
    }
 }
 // Empty impl: marks `RkvErrWrap` as a standard error type using only the trait's default methods.
 impl std::error::Error for RkvErrWrap {}
-impl PreprocCache for LmdbCache {
+#[async_trait::async_trait]
-    fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>> {
+impl PreprocCache for SqliteCache {
-        let start = Instant::now();
+    async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
-        let db_env = self
+        let key = (*key).clone(); // todo: without cloning
-            .db_arc
+        Ok(self
-            .read()
+            .db
-            .map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
+            .call(move |db| {
-        let db = db_env
+                db.query_row(
-            .open_single(db_name, rkv::store::Options::create())
+                    "select text_content_zstd from preproc_cache where
-            .map_err(RkvErrWrap)
+                            adapter = :adapter
-            .context("could not open cache db store")?;
+                        and adapter_version = :adapter_version
-
+                        and active_adapters = :active_adapters
-        let reader = db_env.read().expect("could not get reader");
+                        and file_path = :file_path
-        let cached = db
+                        and file_mtime_unix_ms = :file_mtime_unix_ms
-            .get(&reader, key)
+                ",
-            .map_err(RkvErrWrap)
+                    named_params! {
-            .context("could not read from db")?;
+                        ":adapter": &key.adapter,
-
+                        ":adapter_version": &key.adapter_version,
-        match cached {
+                        ":active_adapters": &key.active_adapters,
-            Some(rkv::Value::Blob(cached)) => {
+                        ":file_path": &key.file_path,
-                debug!(
+                        ":file_mtime_unix_ms": &key.file_mtime_unix_ms
-                    "cache HIT, reading {} (compressed) from cache",
+                    },
-                    print_bytes(cached.len() as f64)
+                    |r| r.get::<_, Vec<u8>>(0),
-                );
+                )
-                debug!("reading from cache took {}", print_dur(start));
+                .optional()
-                Ok(Some(Vec::from(cached)))
+            })
            .await
            .context("reading from cache")?)
    }
-            Some(_) => Err(format_err!("Integrity: value not blob"))?,
+
-            None => Ok(None),
+    async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
        // Stores `value` under `key` as an upsert: a new row is inserted, or,
        // when a row with the same (adapter, adapter_version, active_adapters,
        // file_path) already exists, its file_mtime_unix_ms, created_unix_ms
        // and text_content_zstd are overwritten.
        // NOTE(review): the column name suggests `value` is already
        // zstd-compressed by the caller; confirm at the call site.
        //
        // The key is cloned so the `move` closure below owns its own copy.
        let key = (*key).clone(); // todo: without cloning
        Ok(self
            .db
            .call(move |db| {
                db.execute(
                    "insert into preproc_cache (adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
                        (:adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
                    on conflict (adapter, adapter_version, active_adapters, file_path) do update set
                        file_mtime_unix_ms = :file_mtime_unix_ms,
                        created_unix_ms = unixepoch() * 1000,
                        text_content_zstd = :text_content_zstd",
                    named_params! {
                        ":adapter": &key.adapter,
                        ":adapter_version": &key.adapter_version,
                        ":active_adapters": &key.active_adapters,
                        ":file_path": &key.file_path,
                        ":file_mtime_unix_ms": &key.file_mtime_unix_ms,
                        ":text_content_zstd": value
                    }
                // Only success or failure matters here; the value returned by `execute` is dropped.
                ).map(|_| ())
            })
            .await?)
    }
 }
-    fn set(&mut self, db_name: &str, key: &[u8], got: &[u8]) -> Result<()> {
+/// opens a default cache
-        let start = Instant::now();
+pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
-        debug!("writing {} to cache", print_bytes(got.len() as f64));
+    std::fs::create_dir_all(path)?;
-        let db_env = self
+    SqliteCache::new(path).await
-            .db_arc
+}
            .read()
            .map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
-        let db = db_env
+#[cfg(test)]
-            .open_single(db_name, rkv::store::Options::create())
+mod test {
            .map_err(RkvErrWrap)
            .context("could not open cache db store")?;
-        let mut writer = db_env
+    use crate::preproc_cache::*;
            .write()
            .map_err(RkvErrWrap)
            .with_context(|| format_err!("could not open write handle to cache"))?;
-        db.put(&mut writer, key, &rkv::Value::Blob(got))
+    #[tokio::test]
-            .map_err(RkvErrWrap)
+    async fn test_read_write() -> anyhow::Result<()> {
-            .with_context(|| format_err!("could not write to cache"))?;
+        let path = tempfile::tempdir()?;
-        writer
+        let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
-            .commit()
+        // db.set();
            .map_err(RkvErrWrap)
            .with_context(|| "could not write cache".to_string())?;
        debug!("writing to cache took {}", print_dur(start));
        Ok(())
    }
 }