Merge remote-tracking branch 'upstream/master' into mbox-extractor

2024-11-08 14:00:37 +00:00 · 2023-07-31 14:34:18 +02:00 · 2023-07-31 14:34:18 +02:00 · 2259730c67
commit 2259730c67
parent 9642552fa3 16b2059851
26 changed files with 1358 additions and 1298 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,27 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Describe the bug**
+
+
+**To Reproduce**
+
+Attach example file:
+
+Run command:
+
+**Output**
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Operating System and Version**
+
+
+**Output of `rga --version`**
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -1,75 +1,25 @@
 # Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
 #
-# While our "example" application has the platform-specific code,
-# for simplicity we are compiling and testing everything on the Ubuntu environment only.
-# For multi-OS testing see the `cross.yml` workflow.
+# While our "example" application has platform-specific code,
+# for simplicity we are compiling and testing everything in a nix-on-Linux environment only.

 on: [push, pull_request]

 name: ci

 jobs:
-  check:
-    name: Check
+  nix-flake-check:
+    name: nix flake check
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3

-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: stable
-          override: true
+      - name: Install nix
+        uses: cachix/install-nix-action@v21

-      - name: Run cargo check
-        uses: actions-rs/cargo@v1
-        with:
-          command: check
+      - name: Ensure the build succeeds
+        run: nix build

-  test:
-    name: Test Suite
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout sources
-        uses: actions/checkout@v2
-
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: stable
-          override: true
-
-      - name: Run cargo test
-        uses: actions-rs/cargo@v1
-        with:
-          command: test
-
-  lints:
-    name: Lints
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout sources
-        uses: actions/checkout@v2
-
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: stable
-          override: true
-          components: rustfmt, clippy
-
-      - name: Run cargo fmt
-        uses: actions-rs/cargo@v1
-        with:
-          command: fmt
-          args: --all -- --check
-
-      - name: Run cargo clippy
-        uses: actions-rs/cargo@v1
-        with:
-          command: clippy
-          args: -- -D warnings
+      - name: Run `nix flake check` to run formatters, linters, and tests
+        run: nix flake check --print-build-logs
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -18,7 +18,7 @@ on:
    # branches:
    # - ag/release
    tags:
-    - 'v[0-9]+.[0-9]+.[0-9]+'
+      - "v[0-9]+.[0-9]+.[0-9]+*"
 jobs:
  create-release:
    name: create-release
@ -62,7 +62,7 @@ jobs:

  build-release:
    name: build-release
-    needs: ['create-release']
+    needs: ["create-release"]
    runs-on: ${{ matrix.os }}
    env:
      # For some builds, we use cross to test on 32-bit and big-endian
@ -79,11 +79,11 @@ jobs:
        build: [linux, linux-arm, macos, win-msvc]
        include:
          - build: linux
-          os: ubuntu-18.04
+            os: ubuntu-22.04
            rust: nightly
            target: x86_64-unknown-linux-musl
          - build: linux-arm
-          os: ubuntu-18.04
+            os: ubuntu-22.04
            rust: nightly
            target: arm-unknown-linux-gnueabihf
          - build: macos
@ -106,7 +106,7 @@ jobs:
          fetch-depth: 1

      - name: Install packages (Ubuntu)
-      if: matrix.os == 'ubuntu-18.04'
+        if: matrix.os == 'ubuntu-22.04'
        run: |
          ci/ubuntu-install-packages

@ -124,7 +124,7 @@ jobs:
          target: ${{ matrix.target }}

      - name: Use Cross
-      # if: matrix.os != 'windows-2019'
+        shell: bash
        run: |
          cargo install cross
          echo "CARGO=cross" >> $GITHUB_ENV
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -2,7 +2,7 @@
 [package]
 authors = ["phiresky <phireskyde+git@gmail.com>"]
 description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
-edition = "2018"
+edition = "2021"
 exclude = [
  "exampledir/*",
 ]
@ -11,56 +11,57 @@ license = "AGPL-3.0-or-later"
 name = "ripgrep_all"
 readme = "README.md"
 repository = "https://github.com/phiresky/ripgrep-all"
-version = "1.0.0-alpha.2"
+version = "1.0.0-alpha.5"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-anyhow = "1.0.32"
-async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]}
-async-stream = "0.3.3"
-async-trait = "0.1.64"
-async_zip = "0.0.9"
-bincode = "1.3.1"
-bytes = "1.2.1"
-clap = {version = "4.0.18", features = ["wrap_help"]}
-crossbeam = "0.8.1"
-crossbeam-channel = "0.5.1"
-derive_more = "0.99.9"
+anyhow = {version = "1.0.71", features = ["backtrace"]}
+async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] }
+async-stream = "0.3.5"
+async-trait = "0.1.68"
+async_zip = {version = "0.0.12", features = ["full"]}
+bincode = "1.3.3"
+bytes = "1.4.0"
+clap = {version = "4.3.0", features = ["wrap_help"]}
+crossbeam = "0.8.2"
+crossbeam-channel = "0.5.8"
+derive_more = "0.99.17"
 directories-next = "2.0.0"
 dyn-clonable = "0.9.0"
-dyn-clone = "1.0.2"
-encoding_rs = "0.8.24"
+dyn-clone = "1.0.11"
+encoding_rs = "0.8.32"
 encoding_rs_io = "0.1.7"
-env_logger = "0.9.0"
-glob = "0.3.0"
+env_logger = "0.10.0"
+glob = "0.3.1"
 json_comments = "0.2.1"
 lazy_static = "1.4.0"
-log = "0.4.11"
+log = "0.4.17"
 mailbox = "0.2.0"
 mailparse = "0.14.0"
-memchr = "2.3.3"
+memchr = "2.5.0"
 mime2ext = "0.1.52"
-paste = "1.0.0"
-path-clean = "0.1.0"
+paste = "1.0.12"
+path-clean = "1.0.1"
 pretty-bytes = "0.2.2"
-regex = "1.3.9"
-rkv = "0.17"
-rusqlite = {version = "0.28.0", features = ["vtab", "bundled"]}
-schemars = {version = "0.8.0-alpha-4", features = ["preserve_order"]}
-serde = {version = "1.0.115", features = ["derive"]}
-serde_json = "1.0.57"
+regex = "1.8.2"
+rusqlite = {version = "0.29.0", features = ["vtab", "bundled"]}
+schemars = {version = "0.8.12", features = ["preserve_order"]}
+serde = {version = "1.0.163", features = ["derive"]}
+serde_json = "1.0.96"
 size_format = "1.0.2"
-structopt = "0.3.17"
-tempfile = "3.1.0"
-tokio = {version = "1.21.2", features = ["full"]}
-tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]}
+structopt = "0.3.26"
+tempfile = "3.5.0"
+tokio = {version = "1.28.1", features = ["full"]}
+tokio-rusqlite = "0.4.0"
+tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]}
 tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" }
-tokio-util = {version = "0.7.4", features = ["io", "full"]}
-tree_magic = {package = "tree_magic_mini", version = "3.0.0"}
+tokio-util = {version = "0.7.8", features = ["io", "full"]}
+tree_magic = {package = "tree_magic_mini", version = "3.0.3"}

 [dev-dependencies]
-async-recursion = "1.0.0"
-ctor = "0.1.20"
+async-recursion = "1.0.4"
+ctor = "0.2.0"
 pretty_assertions = "1.3.0"
+tempfile = "3.5.0"
 tokio-test = "0.4.2"
--- a/README.md
+++ b/README.md
@ -33,45 +33,7 @@ demo/

 ![rga-fzf](doc/rga-fzf.gif)

-You can use rga interactively via fzf. Add the following to your ~/.{bash,zsh}rc:
-
-```bash
-rga-fzf() {
-	RG_PREFIX="rga --files-with-matches"
-	local file
-	file="$(
-		FZF_DEFAULT_COMMAND="$RG_PREFIX '$1'" \
-			fzf --sort --preview="[[ ! -z {} ]] && rga --pretty --context 5 {q} {}" \
-				--phony -q "$1" \
-				--bind "change:reload:$RG_PREFIX {q}" \
-				--preview-window="70%:wrap"
-	)" &&
-	echo "opening $file" &&
-	xdg-open "$file"
-}
-```
-
-And for your `~/.config/fish/config.fish`:
-```
-function rga-fzf
-    set RG_PREFIX 'rga --files-with-matches'
-    if test (count $argv) -gt 1
-        set RG_PREFIX "$RG_PREFIX $argv[1..-2]"
-    end
-    set -l file $file
-    set file (
-        FZF_DEFAULT_COMMAND="$RG_PREFIX '$argv[-1]'" \
-        fzf --sort \
-            --preview='test ! -z {} && \
-                rga --pretty --context 5 {q} {}' \
-            --phony -q "$argv[-1]" \
-            --bind "change:reload:$RG_PREFIX {q}" \
-            --preview-window='50%:wrap'
-    ) && \
-    echo "opening $file" && \
-    open "$file"
-end
-```
+See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.

 ## INSTALLATION

@ -86,9 +48,11 @@ Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestr
 `pacman -S ripgrep-all`.

 #### Nix
+
 `nix-env -iA nixpkgs.ripgrep-all`

 #### Debian-based
+
 download the [rga binary][latestrelease] and get the dependencies like this:

 `apt install ripgrep pandoc poppler-utils ffmpeg`
@ -117,7 +81,7 @@ If you get an error like `VCRUNTIME140.DLL could not be found`, you need to inst

 To install the dependencies that are each not strictly necessary but very useful:

-`brew install pandoc poppler tesseract ffmpeg`
+`brew install pandoc poppler ffmpeg`

 ### Compile from source

@ -131,58 +95,58 @@ rga should compile with stable Rust (v1.36.0+, check with `rustc --version`). To

 ## Available Adapters

+rga works with _adapters_ that adapt various file formats. It comes with a few adapters integrated:
+
 ```
 rga --rga-list-adapters
 ```

+You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
+
 <!-- this part generated by update-readme.sh -->

 Adapters:

-   **ffmpeg**
-    Uses ffmpeg to extract video metadata/chapters and subtitles.  
-     Extensions: `.mkv`, `.mp4`, `.avi`
-
-*   **pandoc**
-    Uses pandoc to convert binary/unreadable text documents to plain markdown-like text.  
-     Extensions: `.epub`, `.odt`, `.docx`, `.fb2`, `.ipynb`
+- **pandoc**
+  Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
+  Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx  
+   Extensions: .epub, .odt, .docx, .fb2, .ipynb

 - **poppler**
-    Uses pdftotext (from poppler-utils) to extract plain text from PDF files.  
-     Extensions: `.pdf`  
-     Mime Types: `application/pdf`
+  Uses pdftotext (from poppler-utils) to extract plain text from PDF files
+  Runs: pdftotext - -  
+   Extensions: .pdf  
+   Mime Types: application/pdf
+
+- **postprocpagebreaks**
+  Adds the page number to each line for an input file that specifies page breaks as ascii page break character.
+  Mainly to be used internally by the poppler adapter.  
+   Extensions: .asciipagebreaks
+
+- **ffmpeg**
+  Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata  
+   Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm

 - **zip**
-    Reads a zip file as a stream and recurses down into its contents.  
-     Extensions: `.zip`  
-     Mime Types: `application/zip`
+  Reads a zip file as a stream and recurses down into its contents  
+   Extensions: .zip, .jar  
+   Mime Types: application/zip

 - **decompress**
  Reads compressed file as a stream and runs a different extractor on the contents.  
-     Extensions: `.tgz`, `.tbz`, `.tbz2`, `.gz`, `.bz2`, `.xz`, `.zst`  
-     Mime Types: `application/gzip`, `application/x-bzip`, `application/x-xz`, `application/zstd`
+   Extensions: .tgz, .tbz, .tbz2, .gz, .bz2, .xz, .zst  
+   Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd

 - **tar**
-    Reads a tar file as a stream and recurses down into its contents.  
-     Extensions: `.tar`
+  Reads a tar file as a stream and recurses down into its contents  
+   Extensions: .tar

-*   **sqlite**
-    Uses sqlite bindings to convert sqlite databases into a simple plain text format.  
-     Extensions: `.db`, `.db3`, `.sqlite`, `.sqlite3`  
-     Mime Types: `application/x-sqlite3`
+- **sqlite**
+  Uses sqlite bindings to convert sqlite databases into a simple plain text format  
+   Extensions: .db, .db3, .sqlite, .sqlite3  
+   Mime Types: application/x-sqlite3

-The following adapters are disabled by default, and can be enabled using `--rga-adapters=+pdfpages,tesseract`:
-
-   **pdfpages**
-    Converts a pdf to its individual pages as png files. Only useful in combination with tesseract.  
-     Extensions: `.pdf`  
-     Mime Types: `application/pdf`
-
-   **tesseract**
-    Uses tesseract to run OCR on images to make them searchable.
-    May need `-j1` to prevent overloading the system.
-    Make sure you have tesseract installed.  
-     Extensions: `.jpg`, `.png`
+The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':

 ## USAGE:

@ -202,6 +166,17 @@ The following adapters are disabled by default, and can be enabled using `--rga-
 > Detection is only done on the first 8KiB of the file, since we can\'t
 > always seek on the input (in archives).

+**\--rga-no-cache**
+
+> Disable caching of results
+>
+> By default, rga caches the extracted text, if it is small enough, to a
+> database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
+> _\~/Library/Caches/ripgrep-all_ on macOS, or
+> C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
+> repeated searches on the same set of files will be much faster. If you
+> pass this flag, all caching will be disabled.
+
 **-h**, **\--help**

 > Prints help information
@ -210,15 +185,9 @@ The following adapters are disabled by default, and can be enabled using `--rga-

 > List all known adapters

-**\--rga-no-cache**
+**\--rga-print-config-schema**

-> Disable caching of results
->
-> By default, rga caches the extracted text, if it is small enough, to a
-> database in \~/.cache/rga on Linux, _\~/Library/Caches/rga_ on macOS,
-> or C:\\Users\\username\\AppData\\Local\\rga on Windows. This way,
-> repeated searches on the same set of files will be much faster. If you
-> pass this flag, all caching will be disabled.
+> Print the JSON Schema of the configuration file

 **\--rg-help**

@ -242,25 +211,32 @@ The following adapters are disabled by default, and can be enabled using `--rga-
 > use all default adapters except for bar and baz. \"+bar,baz\" means
 > use all default adapters and also bar and baz.

-**\--rga-cache-compression-level=**\<cache-compression-level\>
+**\--rga-cache-compression-level=**\<compression-level\>

 > ZSTD compression level to apply to adapter outputs before storing in
 > cache db
 >
 > Ranges from 1 - 22 \[default: 12\]

-**\--rga-cache-max-blob-len=**\<cache-max-blob-len\>
-
-> Max compressed size to cache
->
-> Longest byte length (after compression) to store in cache. Longer
-> adapter outputs will not be cached and recomputed every time. Allowed
-> suffixes: k M G \[default: 2000000\]
+**\--rga-config-file=**\<config-file-path\>

 **\--rga-max-archive-recursion=**\<max-archive-recursion\>

 > Maximum nestedness of archives to recurse into \[default: 4\]

+**\--rga-cache-max-blob-len=**\<max-blob-len\>
+
+> Max compressed size to cache
+>
+> Longest byte length (after compression) to store in cache. Longer
+> adapter outputs will not be cached and recomputed every time.
+>
+> Allowed suffixes on command line: k M G \[default: 2000000\]
+
+**\--rga-cache-path=**\<path\>
+
+> Path to store cache db \[default: /home/phire/.cache/ripgrep-all\]
+
 **-h** shows a concise overview, **\--help** shows more detail and
 advanced options.

@ -287,6 +263,7 @@ to debug the adapters.

 You can use the provided [`flake.nix`](./flake.nix) to setup all build- and
 run-time dependencies:
+
 1. Enable [Flakes](https://nixos.wiki/wiki/Flakes) in your Nix configuration.
 1. Add [`direnv`](https://direnv.net/) to your profile:
   `nix profile install nixpkgs#direnv`
--- a/doc/config.default.jsonc
+++ b/doc/config.default.jsonc
@ -7,7 +7,7 @@
  // https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc

  // The config options are the same as the command line options,
-  // but with --rga- prefix removed and - replaced with _.
+  // but with --rga- prefix removed and - and . replaced with _.
  // e.g. --rga-no-cache becomes `"no_cache": true.
  // The only exception is the `custom_adapters` option, which can only be set in this file.

--- a/doc/update-readme.sh
+++ b/doc/update-readme.sh
@ -5,7 +5,7 @@ content=$(
 <!-- this part generated by update-readme.sh -->
 $(cargo run --bin rga -- --rga-list-adapters)

-$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --atx-headers | rg --multiline "## USAGE:(.|\n)*")
+$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
 <!-- end of part generated by update-readme.sh -->
 END
 )
--- a/exampledir/exif.png
+++ b/exampledir/exif.png
--- a/flake.lock
+++ b/flake.lock
@ -3,11 +3,11 @@
    "advisory-db": {
      "flake": false,
      "locked": {
-        "lastModified": 1670452192,
-        "narHash": "sha256-f8NIFbqSbCzpssgDUK4srfgKaVaMhDScEptw4uuxGAc=",
+        "lastModified": 1685821301,
+        "narHash": "sha256-4XRcnSboLJw1XKjDpg2jBU70jEw/8Bgx4nUmnq3kXbY=",
        "owner": "rustsec",
        "repo": "advisory-db",
-        "rev": "0a2faeb87195392b23333a8097309d29f2c5d31d",
+        "rev": "af3f3d503f82056785841bee49997bae65eba1c0",
        "type": "github"
      },
      "original": {
@ -26,11 +26,11 @@
        "rust-overlay": "rust-overlay"
      },
      "locked": {
-        "lastModified": 1670546681,
-        "narHash": "sha256-S33bhME0zPHPEZyZPCsrdQL/4WW/A020PwN+a3z7Q+I=",
+        "lastModified": 1684981077,
+        "narHash": "sha256-68X9cFm0RTZm8u0rXPbeBzOVUH5OoUGAfeHHVoxGd9o=",
        "owner": "ipetkov",
        "repo": "crane",
-        "rev": "63f80ee278897e72a1468090278716b5befa5128",
+        "rev": "35110cccf28823320f4fd697fcafcb5038683982",
        "type": "github"
      },
      "original": {
@ -42,11 +42,11 @@
    "flake-compat": {
      "flake": false,
      "locked": {
-        "lastModified": 1668681692,
-        "narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
+        "lastModified": 1673956053,
+        "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
        "owner": "edolstra",
        "repo": "flake-compat",
-        "rev": "009399224d5e398d03b22badca40a37ac85412a1",
+        "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
        "type": "github"
      },
      "original": {
@ -58,11 +58,11 @@
    "flake-compat_2": {
      "flake": false,
      "locked": {
-        "lastModified": 1668681692,
-        "narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
+        "lastModified": 1673956053,
+        "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
        "owner": "edolstra",
        "repo": "flake-compat",
-        "rev": "009399224d5e398d03b22badca40a37ac85412a1",
+        "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
        "type": "github"
      },
      "original": {
@ -72,12 +72,15 @@
      }
    },
    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
      "locked": {
-        "lastModified": 1667395993,
-        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
+        "lastModified": 1681202837,
+        "narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
+        "rev": "cfacdce06f30d2b68473a46042957675eebb3401",
        "type": "github"
      },
      "original": {
@ -87,27 +90,15 @@
      }
    },
    "flake-utils_2": {
-      "locked": {
-        "lastModified": 1667395993,
-        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
-        "type": "github"
+      "inputs": {
+        "systems": "systems_2"
      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "flake-utils_3": {
      "locked": {
-        "lastModified": 1667395993,
-        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
+        "lastModified": 1685518550,
+        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
+        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
        "type": "github"
      },
      "original": {
@ -139,48 +130,31 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1670525689,
-        "narHash": "sha256-YIjGzxrRQa5LYO0zlnH/ardcwXsRgsnHe3TkGkvCxbc=",
+        "lastModified": 1685860998,
+        "narHash": "sha256-ZexAPe8yvJaLvn5aVgjW0vY41RnmJGbgOdGBJk1yDIE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "f21f11aa2a02cb78651c6d57546c7d7541f9240c",
+        "rev": "45d47b647d7bbaede5121d731cbee78f6093b6d6",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixpkgs-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs-stable": {
      "locked": {
-        "lastModified": 1668984258,
-        "narHash": "sha256-0gDMJ2T3qf58xgcSbYoXiRGUkPWmKyr5C3vcathWhKs=",
+        "lastModified": 1678872516,
+        "narHash": "sha256-/E1YwtMtFAu2KUQKV/1+KFuReYPANM2Rzehk84VxVoc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "cf63ade6f74bbc9d2a017290f1b2e33e8fbfa70a",
+        "rev": "9b8e5abb18324c7fe9f07cb100c3cd4a29cda8b8",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-22.05",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "nixpkgs_2": {
-      "locked": {
-        "lastModified": 1668994630,
-        "narHash": "sha256-1lqx6HLyw6fMNX/hXrrETG1vMvZRGm2XVC9O/Jt0T6c=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "af50806f7c6ab40df3e6b239099e8f8385f6c78b",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
+        "ref": "nixos-22.11",
        "repo": "nixpkgs",
        "type": "github"
      }
@ -188,17 +162,21 @@
    "pre-commit-hooks": {
      "inputs": {
        "flake-compat": "flake-compat_2",
-        "flake-utils": "flake-utils_3",
+        "flake-utils": [
+          "flake-utils"
+        ],
        "gitignore": "gitignore",
-        "nixpkgs": "nixpkgs_2",
+        "nixpkgs": [
+          "nixpkgs"
+        ],
        "nixpkgs-stable": "nixpkgs-stable"
      },
      "locked": {
-        "lastModified": 1670413394,
-        "narHash": "sha256-M7sWqrKtOqUv9euX1t3HCxis8cPy9MNiZxQmUf0KF1o=",
+        "lastModified": 1685361114,
+        "narHash": "sha256-4RjrlSb+OO+e1nzTExKW58o3WRwVGpXwj97iCta8aj4=",
        "owner": "cachix",
        "repo": "pre-commit-hooks.nix",
-        "rev": "1303a1a76e9eb074075bfe566518c413f6fc104e",
+        "rev": "ca2fdbf3edda2a38140184da6381d49f8206eaf4",
        "type": "github"
      },
      "original": {
@ -229,11 +207,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1670034122,
-        "narHash": "sha256-EqmuOKucPWtMvCZtHraHr3Q3bgVszq1x2PoZtQkUuEk=",
+        "lastModified": 1683080331,
+        "narHash": "sha256-nGDvJ1DAxZIwdn6ww8IFwzoHb2rqBP4wv/65Wt5vflk=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "a0d5773275ecd4f141d792d3a0376277c0fc0b65",
+        "rev": "d59c3fa0cba8336e115b376c2d9e91053aa59e56",
        "type": "github"
      },
      "original": {
@ -252,11 +230,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1670552927,
-        "narHash": "sha256-lCE51eAGrAFS4k9W5aDGFpVtOAwQQ/rFMN80PCDh0vo=",
+        "lastModified": 1685846256,
+        "narHash": "sha256-G4aYK4VqlMHImvZ0lUnLHw1A+Cx28T0sBMvAKZBcGpk=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "a0fdafd18c9cf599fde17fbaf07dbb20fa57eecb",
+        "rev": "1ef3c6de6127a1cba94cc5492cdde52e33d06ea4",
        "type": "github"
      },
      "original": {
@ -264,6 +242,36 @@
        "repo": "rust-overlay",
        "type": "github"
      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@ -3,7 +3,7 @@
    "ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.";

  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
+    nixpkgs.url = "github:NixOS/nixpkgs";

    crane = {
      url = "github:ipetkov/crane";
@ -25,7 +25,13 @@
      flake = false;
    };

-    pre-commit-hooks.url = "github:cachix/pre-commit-hooks.nix";
+    pre-commit-hooks = {
+      url = "github:cachix/pre-commit-hooks.nix";
+      inputs = {
+        nixpkgs.follows = "nixpkgs";
+        flake-utils.follows = "flake-utils";
+      };
+    };
  };

  outputs = { self, nixpkgs, crane, flake-utils, rust-overlay, advisory-db
@ -36,14 +42,16 @@
          inherit system;
          overlays = [ (import rust-overlay) ];
        };
-        inherit (pkgs) lib;

        craneLib = crane.lib.${system};
-        src = craneLib.cleanCargoSource ./.;
+        src = pkgs.lib.cleanSourceWith {
+          src = craneLib.path ./.;
+          filter = pkgs.lib.cleanSourceFilter;
+        };

        buildInputs = with pkgs;
          [ ffmpeg imagemagick pandoc poppler_utils ripgrep tesseract ]
-          ++ lib.optionals pkgs.stdenv.isDarwin [
+          ++ pkgs.lib.optionals pkgs.stdenv.isDarwin [
            # Additional darwin specific inputs can be set here
            pkgs.libiconv
          ];
@ -54,10 +62,7 @@

        # Build the actual crate itself, reusing the dependency
        # artifacts from above.
-        rga = craneLib.buildPackage {
-          inherit cargoArtifacts src buildInputs;
-          doCheck = false;
-        };
+        rga = craneLib.buildPackage { inherit cargoArtifacts src buildInputs; };

        pre-commit = pre-commit-hooks.lib."${system}".run;
      in {
@ -97,18 +102,20 @@
            hooks = {
              nixfmt.enable = true;
              rustfmt.enable = true;
-              cargo-check.enable = true;
+              typos = {
+                enable = true;
+                types = [ "text" ];
+                excludes = [ "exampledir/.*" ];
+              };
            };
          };
-        } // lib.optionalAttrs (system == "x86_64-linux") {
-          # NB: cargo-tarpaulin only supports x86_64 systems
-          # Check code coverage (note: this will not upload coverage anywhere)
-          rga-coverage =
-            craneLib.cargoTarpaulin { inherit cargoArtifacts src; };
        };

        # `nix build`
-        packages.default = rga;
+        packages = {
+          inherit rga; # `nix build .#rga`
+          default = rga; # `nix build`
+        };

        # `nix run`
        apps.default = flake-utils.lib.mkApp { drv = rga; };
--- a/src/adapters.rs
+++ b/src/adapters.rs
@ -10,6 +10,7 @@ pub mod writing;
 pub mod zip;
 use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
 use anyhow::{format_err, Context, Result};
+use async_trait::async_trait;
 use custom::CustomAdapterConfig;
 use custom::BUILTIN_SPAWNING_ADAPTERS;
 use log::*;
@ -77,11 +78,17 @@ impl AdapterMeta {
 pub trait GetMetadata {
    fn metadata(&self) -> &AdapterMeta;
 }
+
+#[async_trait]
 pub trait FileAdapter: GetMetadata + Send + Sync {
    /// adapt a file.
    ///
    /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
-    fn adapt(&self, a: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox>;
+    async fn adapt(
+        &self,
+        a: AdaptInfo,
+        detection_reason: &FileMatcher,
+    ) -> Result<AdaptedFilesIterBox>;
 }

 pub struct AdaptInfo {
--- a/src/adapters/custom.rs
+++ b/src/adapters/custom.rs
@ -49,8 +49,9 @@ pub struct CustomAdapterConfig {
    pub args: Vec<String>,
    /// The output path hint. The placeholders are the same as for `.args`
    ///
-    /// If not set, defaults to ${input_virtual_path}.txt
+    /// If not set, defaults to "${input_virtual_path}.txt"
    ///
+    /// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter
    pub output_path_hint: Option<String>,
 }

@ -128,7 +129,6 @@ lazy_static! {
            disabled_by_default: None,
            match_only_by_mime: None,
            output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into())
-            // postprocessors: [{name: "add_page_numbers_by_pagebreaks"}]
        }
    ];
 }
@ -143,15 +143,13 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow:
    }
 }

-fn proc_wait(mut child: Child) -> impl AsyncRead {
+fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead {
    let s = stream! {
        let res = child.wait().await?;
        if res.success() {
            yield std::io::Result::Ok(Bytes::new());
        } else {
-            yield std::io::Result::Err(to_io_err(
-                format_err!("subprocess failed: {:?}", res),
-            ));
+            Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?;
        }
    };
    StreamReader::new(s)
@ -164,6 +162,7 @@ pub fn pipe_output(
    exe_name: &str,
    help: &str,
 ) -> Result<ReadBox> {
+    let cmd_log = format!("{:?}", cmd); // todo: perf
    let mut cmd = cmd
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
@ -177,10 +176,9 @@ pub fn pipe_output(
        tokio::io::copy(&mut z, &mut stdi).await?;
        std::io::Result::Ok(())
    });
-
-    Ok(Box::pin(
-        stdo.chain(proc_wait(cmd).chain(join_handle_to_stream(join))),
-    ))
+    Ok(Box::pin(stdo.chain(
+        proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)),
+    )))
 }

 pub struct CustomSpawningFileAdapter {
@ -224,8 +222,9 @@ impl CustomSpawningFileAdapter {
        Ok(command)
    }
 }
+#[async_trait]
 impl FileAdapter for CustomSpawningFileAdapter {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        ai: AdaptInfo,
        _detection_reason: &FileMatcher,
@ -314,7 +313,7 @@ mod test {

        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        // let r = adapter.adapt(a, &d)?;
-        let r = loop_adapt(&adapter, d, a)?;
+        let r = loop_adapt(&adapter, d, a).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
@ -368,7 +367,7 @@ PREFIX:Page 1:
            Path::new("foo.txt"),
            Box::pin(Cursor::new(Vec::from(input))),
        );
-        let output = adapter.adapt(a, &d).unwrap();
+        let output = adapter.adapt(a, &d).await.unwrap();

        let oup = adapted_to_vec(output).await?;
        println!("output: {}", String::from_utf8_lossy(&oup));
--- a/src/adapters/decompress.rs
+++ b/src/adapters/decompress.rs
@ -93,8 +93,13 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
    filename.with_file_name(format!("{}{}", stem, new_extension))
 }

+#[async_trait]
 impl FileAdapter for DecompressAdapter {
-    fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
+        &self,
+        ai: AdaptInfo,
+        detection_reason: &FileMatcher,
+    ) -> Result<AdaptedFilesIterBox> {
        Ok(one_file(AdaptInfo {
            filepath_hint: get_inner_filename(&ai.filepath_hint),
            is_real_file: false,
@ -137,7 +142,7 @@ mod tests {
        let filepath = test_data_dir().join("hello.gz");

        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
-        let r = adapter.adapt(a, &d)?;
+        let r = adapter.adapt(a, &d).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(String::from_utf8(o)?, "hello\n");
        Ok(())
@ -150,7 +155,7 @@ mod tests {
        let filepath = test_data_dir().join("short.pdf.gz");

        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
-        let r = loop_adapt(&adapter, d, a)?;
+        let r = loop_adapt(&adapter, d, a).await?;
        let o = adapted_to_vec(r).await?;
        assert_eq!(
            String::from_utf8(o)?,
--- a/src/adapters/ffmpeg.rs
+++ b/src/adapters/ffmpeg.rs
@ -14,13 +14,15 @@ use writing::WritingFileAdapter;
 // maybe todo: read list of extensions from
 // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
 // but really, the probability of getting useful information from a .flv is low
-static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
+static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"];

 lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "ffmpeg".to_owned(),
        version: 1,
-        description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
+        description:
+            "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata"
+                .to_owned(),
        recurses: false,
        fast_matchers: EXTENSIONS
            .iter()
@ -52,7 +54,7 @@ struct FFprobeOutput {
 }
 #[derive(Serialize, Deserialize)]
 struct FFprobeStream {
-    codec_type: String, // video,audio,subtitle
+    index: i32, // stream index
 }

 #[async_trait]
@ -78,17 +80,17 @@ impl WritingFileAdapter for FFmpegAdapter {
        }
        let inp_fname = filepath_hint;
        let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed.");
-        let has_subtitles = {
+        let subtitle_streams = {
            let probe = Command::new("ffprobe")
                .args(vec![
                    "-v",
-                    "error",
+                    "error", // show all errors
                    "-select_streams",
-                    "s",
+                    "s", // show only subtitle streams
                    "-of",
-                    "json",
+                    "json", // use json as output format
                    "-show_entries",
-                    "stream=codec_type",
+                    "stream=index", // show index of subtitle streams
                ])
                .arg("-i")
                .arg(&inp_fname)
@ -96,10 +98,14 @@ impl WritingFileAdapter for FFmpegAdapter {
                .await
                .map_err(spawn_fail)?;
            if !probe.status.success() {
-                return Err(format_err!("ffprobe failed: {:?}", probe.status));
+                return Err(format_err!(
+                    "ffprobe failed: {:?}\n{}",
+                    probe.status,
+                    String::from_utf8_lossy(&probe.stderr)
+                ));
            }
            let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?;
-            !p.streams.is_empty()
+            p.streams
        };
        {
            // extract file metadata (especially chapter names in a greppable format)
@ -124,6 +130,7 @@ impl WritingFileAdapter for FFmpegAdapter {
                .spawn()?;
            let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines();
            while let Some(line) = lines.next_line().await? {
+                let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines
                async_writeln!(oup, "metadata: {line}")?;
            }
            let exit = probe.wait().await?;
@ -131,7 +138,8 @@ impl WritingFileAdapter for FFmpegAdapter {
                return Err(format_err!("ffprobe failed: {:?}", exit));
            }
        }
-        if has_subtitles {
+        if !subtitle_streams.is_empty() {
+            for probe_stream in subtitle_streams.iter() {
                // extract subtitles
                let mut cmd = Command::new("ffmpeg");
                cmd.arg("-hide_banner")
@ -139,6 +147,8 @@ impl WritingFileAdapter for FFmpegAdapter {
                    .arg("panic")
                    .arg("-i")
                    .arg(&inp_fname)
+                    .arg("-map")
+                    .arg(format!("0:{}", probe_stream.index)) // 0 for first input
                    .arg("-f")
                    .arg("webvtt")
                    .arg("-");
@ -159,6 +169,7 @@ impl WritingFileAdapter for FFmpegAdapter {
                    }
                }
            }
+        }
        Ok(())
    }
 }
--- a/src/adapters/postproc.rs
+++ b/src/adapters/postproc.rs
@ -4,7 +4,11 @@

 use anyhow::Result;
 use async_stream::stream;
+use async_trait::async_trait;
 use bytes::Bytes;
+use encoding_rs::Encoding;
+use encoding_rs_io::DecodeReaderBytesBuilder;
+use tokio_util::io::SyncIoBridge;

 use std::io::Cursor;
 use std::path::PathBuf;
@ -41,15 +45,16 @@ impl GetMetadata for PostprocPrefix {
        &METADATA
    }
 }
+#[async_trait]
 impl FileAdapter for PostprocPrefix {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        let read = add_newline(postproc_prefix(
            &a.line_prefix,
-            postproc_encoding(&a.line_prefix, a.inp)?,
+            postproc_encoding(&a.line_prefix, a.inp).await?,
        ));
        // keep adapt info (filename etc) except replace inp
        let ai = AdaptInfo {
@ -74,50 +79,53 @@ impl Read for ReadErr {
 * Detects and converts encodings other than utf-8 to utf-8.
 * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
 */
-pub fn postproc_encoding(
+async fn postproc_encoding(
    _line_prefix: &str,
-    inp: impl AsyncRead + Send + 'static,
+    inp: Pin<Box<dyn AsyncRead + Send>>,
 ) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
-    Ok(Box::pin(inp))
-    // panic!("todo: implement");
-    /*// TODO: parse these options from ripgrep's configuration
+    // check for binary content in first 8kB
+    // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
+    let mut fourk = Vec::with_capacity(1 << 13);
+    let mut beginning = inp.take(1 << 13);
+
+    beginning.read_to_end(&mut fourk).await?;
+    let has_binary = fourk.contains(&0u8);
+
+    let enc = Encoding::for_bom(&fourk);
+    let inp = Cursor::new(fourk).chain(beginning.into_inner());
+    match enc {
+        Some((enc, _)) if enc != encoding_rs::UTF_8 => {
+            // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
+            // TODO: parse these options from ripgrep's configuration
            let encoding = None; // detect bom but usually assume utf8
            let bom_sniffing = true;
            let mut decode_builder = DecodeReaderBytesBuilder::new();
            // https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
            // this detects utf-16 BOMs and transcodes to utf-8 if they are present
            // it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
-    let inp = decode_builder
+            let mut inp = decode_builder
                .encoding(encoding)
                .utf8_passthru(true)
                .strip_bom(bom_sniffing)
                .bom_override(true)
                .bom_sniffing(bom_sniffing)
-        .build(inp);
-
-    // check for binary content in first 8kB
-    // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
-    let mut fourk = Vec::with_capacity(1 << 13);
-    let mut beginning = inp.take(1 << 13);
-
-    beginning.read_to_end(&mut fourk)?;
-
-    if fourk.contains(&0u8) {
-        log::debug!("detected binary");
-        let v = "[rga: binary data]";
-        return Ok(Box::new(std::io::Cursor::new(v)));
-        /*let err = std::io::Error::new(
-            std::io::ErrorKind::InvalidData,
-            format!("{}[rga: binary data]", line_prefix),
-        );
-        return Err(err).context("");
-        return ReadErr {
-            err,
-        };*/
+                .build(SyncIoBridge::new(inp));
+            let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
+                let mut oup = Vec::new();
+                std::io::Read::read_to_end(&mut inp, &mut oup)?;
+                Ok(oup)
+            })
+            .await??;
+            Ok(Box::pin(Cursor::new(oup)))
+        }
+        _ => {
+            if has_binary {
+                log::debug!("detected binary");
+                return Ok(Box::pin(Cursor::new("[rga: binary data]")));
+            }
+            Ok(Box::pin(inp))
+        }
    }
-    Ok(Box::new(
-        std::io::Cursor::new(fourk).chain(beginning.into_inner()),
-    ))*/
 }

 /// Adds the given prefix to each line in an `AsyncRead`.
@ -164,13 +172,14 @@ impl GetMetadata for PostprocPageBreaks {
        &METADATA
    }
 }
+#[async_trait]
 impl FileAdapter for PostprocPageBreaks {
-    fn adapt<'a>(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        _detection_reason: &crate::matching::FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
-        let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
+        let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?);
        // keep adapt info (filename etc) except replace inp
        let ai = AdaptInfo {
            inp: Box::pin(read),
@ -282,7 +291,7 @@ mod tests {
        let fname = test_data_dir().join("twoblankpages.pdf");
        let rd = File::open(&fname).await?;
        let (a, d) = simple_adapt_info(&fname, Box::pin(rd));
-        let res = loop_adapt(&adapter, d, a)?;
+        let res = loop_adapt(&adapter, d, a).await?;

        let buf = adapted_to_vec(res).await?;

@ -327,7 +336,8 @@ PREFIX:Page 3:
        b: &str,
    ) -> Result<()> {
        let mut oup = Vec::new();
-        let inp = postproc_encoding("", a)?;
+        let inp = Box::pin(Cursor::new(a));
+        let inp = postproc_encoding("", inp).await?;
        if pagebreaks {
            postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
        } else {
@ -341,6 +351,23 @@ PREFIX:Page 3:
        Ok(())
    }

+    #[tokio::test]
+    async fn test_utf16() -> Result<()> {
+        let utf16lebom: &[u8] = &[
+            0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00,
+            0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8,
+            0xa9, 0xdc, 0x0a, 0x00,
+        ];
+        let utf16bebom: &[u8] = &[
+            0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20,
+            0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d,
+            0xdc, 0xa9, 0x00, 0x0a,
+        ];
+        test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?;
+        test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?;
+        Ok(())
+    }
+
    #[tokio::test]
    async fn post1() -> Result<()> {
        let inp = "What is this\nThis is a test\nFoo";
@ -362,8 +389,7 @@ PREFIX:Page 3:

        Ok(())
    }
-    /*
-    todo: uncomment when fixed
+
    #[tokio::test]
    async fn test_binary_content() -> Result<()> {
        test_from_strs(
@ -375,7 +401,7 @@ PREFIX:Page 3:
        .await?;
        test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
        Ok(())
-     }*/
+    }

    /*#[test]
    fn chardet() -> Result<()> {
--- a/src/adapters/sqlite.rs
+++ b/src/adapters/sqlite.rs
@ -77,11 +77,13 @@ fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> {
        return Ok(());
    }
    let inp_fname = filepath_hint;
-
-    let conn = Connection::open_with_flags(inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
+    let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)
+        .with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?;
    let tables: Vec<String> = conn
-        .prepare("select name from sqlite_master where type='table'")?
-        .query_map([], |r| r.get::<_, String>(0))?
+        .prepare("select name from sqlite_master where type='table'")
+        .context("while preparing query")?
+        .query_map([], |r| r.get::<_, String>(0))
+        .context("while executing query")?
        .filter_map(|e| e.ok())
        .collect();
    debug!("db has {} tables", tables.len());
@ -121,7 +123,9 @@ impl WritingFileAdapter for SqliteAdapter {
        oup: Pin<Box<dyn AsyncWrite + Send>>,
    ) -> Result<()> {
        let oup_sync = SyncIoBridge::new(oup);
-        tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)).await??;
+        tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync))
+            .await?
+            .context("in synchronous sqlite task")?;
        Ok(())
    }
 }
@ -134,10 +138,10 @@ mod test {

    #[tokio::test]
    async fn simple() -> Result<()> {
-        let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default());
+        let adapter: Box<dyn FileAdapter> = Box::<SqliteAdapter>::default();
        let fname = test_data_dir().join("hello.sqlite3");
        let (a, d) = simple_fs_adapt_info(&fname).await?;
-        let res = adapter.adapt(a, &d)?;
+        let res = adapter.adapt(a, &d).await?;

        let buf = adapted_to_vec(res).await?;

--- a/src/adapters/tar.rs
+++ b/src/adapters/tar.rs
@ -6,6 +6,7 @@ use crate::{
 };
 use anyhow::*;
 use async_stream::stream;
+use async_trait::async_trait;
 use lazy_static::lazy_static;
 use log::*;
 use std::path::PathBuf;
@ -45,8 +46,13 @@ impl GetMetadata for TarAdapter {
    }
 }

+#[async_trait]
 impl FileAdapter for TarAdapter {
-    fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
+        &self,
+        ai: AdaptInfo,
+        _detection_reason: &FileMatcher,
+    ) -> Result<AdaptedFilesIterBox> {
        let AdaptInfo {
            filepath_hint,
            inp,
@ -103,7 +109,7 @@ mod tests {
        let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));

        let adapter = TarAdapter::new();
-        let r = loop_adapt(&adapter, d, a).context("adapt")?;
+        let r = loop_adapt(&adapter, d, a).await.context("adapt")?;
        let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
        assert_eq!(
            String::from_utf8(o).context("parsing utf8")?,
--- a/src/adapters/writing.rs
+++ b/src/adapters/writing.rs
@ -3,7 +3,7 @@ use std::pin::Pin;
 use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err};

 use super::{AdaptInfo, FileAdapter, GetMetadata};
-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_trait::async_trait;
 use tokio::io::{AsyncReadExt, AsyncWrite};

@ -41,15 +41,17 @@ macro_rules! async_writeln {
 }
 pub(crate) use async_writeln;

+#[async_trait]
 impl<T> FileAdapter for T
 where
    T: WritingFileAdapter,
 {
-    fn adapt(
+    async fn adapt(
        &self,
        a: super::AdaptInfo,
        detection_reason: &crate::matching::FileMatcher,
    ) -> Result<crate::adapted_iter::AdaptedFilesIterBox> {
+        let name = self.metadata().name.clone();
        let (w, r) = tokio::io::duplex(128 * 1024);
        let d2 = detection_reason.clone();
        let archive_recursion_depth = a.archive_recursion_depth + 1;
@ -59,7 +61,10 @@ where
        let config = a.config.clone();
        let joiner = tokio::spawn(async move {
            let x = d2;
-            T::adapt_write(a, &x, Box::pin(w)).await.map_err(to_io_err)
+            T::adapt_write(a, &x, Box::pin(w))
+                .await
+                .with_context(|| format!("in {}.adapt_write", name))
+                .map_err(to_io_err)
        });

        Ok(one_file(AdaptInfo {
--- a/src/adapters/zip.rs
+++ b/src/adapters/zip.rs
@ -5,7 +5,7 @@ use async_stream::stream;
 use lazy_static::lazy_static;
 use log::*;

-static EXTENSIONS: &[&str] = &["zip"];
+static EXTENSIONS: &[&str] = &["zip", "jar"];

 lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
@ -36,8 +36,13 @@ impl GetMetadata for ZipAdapter {
    }
 }

+#[async_trait]
 impl FileAdapter for ZipAdapter {
-    fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
+    async fn adapt(
+        &self,
+        ai: AdaptInfo,
+        _detection_reason: &FileMatcher,
+    ) -> Result<AdaptedFilesIterBox> {
        // let (s, r) = mpsc::channel(1);
        let AdaptInfo {
            inp,
@ -52,11 +57,11 @@ impl FileAdapter for ZipAdapter {
        if is_real_file {
            use async_zip::read::fs::ZipFileReader;

-            let s = stream! {
            let zip = ZipFileReader::new(&filepath_hint).await?;
-                for i in 0..zip.entries().len() {
-                    let reader = zip.entry_reader(i).await?;
-                    let file = reader.entry();
+            let s = stream! {
+                for i in 0..zip.file().entries().len() {
+                    let file = zip.get_entry(i)?;
+                    let reader = zip.entry(i).await?;
                    if file.filename().ends_with('/') {
                        continue;
                    }
@ -98,10 +103,11 @@ impl FileAdapter for ZipAdapter {
            let mut zip = ZipFileReader::new(inp);

            let s = stream! {
-                    while !zip.finished() {
-                    if let Some(reader) = zip.entry_reader().await? {
-                        let file = reader.entry();
+                    while let Some(mut entry) = zip.next_entry().await? {
+                        let file = entry.entry();
                        if file.filename().ends_with('/') {
+                            zip = entry.skip().await?;
+
                            continue;
                        }
                        debug!(
@ -114,6 +120,7 @@ impl FileAdapter for ZipAdapter {
                        );
                        let new_line_prefix = format!("{}{}: ", line_prefix, file.filename());
                        let fname = PathBuf::from(file.filename());
+                        let reader = entry.reader();
                        tokio::pin!(reader);
                        // SAFETY: this should be solvable without unsafe but idk how :(
                        // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream
@ -133,7 +140,8 @@ impl FileAdapter for ZipAdapter {
                            postprocess,
                            config: config.clone(),
                        });
-                    }
+                        zip = entry.done().await.context("going to next file in zip but entry was not read fully")?;
+
                }
            };

@ -183,7 +191,6 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
 mod test {
    use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder};

-
    use super::*;
    use crate::{preproc::loop_adapt, test_utils::*};
    use pretty_assertions::assert_eq;
@ -213,7 +220,7 @@ mod test {
    async fn only_seek_zip_fs() -> Result<()> {
        let zip = test_data_dir().join("only-seek-zip.zip");
        let (a, d) = simple_fs_adapt_info(&zip).await?;
-        let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?;
+        let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?;
        // assert_eq!(String::from_utf8(v)?, "");

        Ok(())
@ -236,7 +243,7 @@ mod test {
            &PathBuf::from("outer.zip"),
            Box::pin(std::io::Cursor::new(zipfile)),
        );
-        let buf = adapted_to_vec(loop_adapt(&adapter, d, a)?).await?;
+        let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?;

        assert_eq!(
            String::from_utf8(buf)?,
--- a/src/bin/rga-preproc.rs
+++ b/src/bin/rga-preproc.rs
@ -43,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
            // happens if e.g. ripgrep detects binary data in the pipe so it cancels reading
            debug!("output cancelled (broken pipe)");
        } else {
-            Err(e).context("copying adapter output to stdout {}")?;
+            Err(e).context("copying adapter output to stdout")?;
        }
    }
    debug!("running adapter took {} total", print_dur(start));
--- a/src/caching_writer.rs
+++ b/src/caching_writer.rs
@ -1,17 +1,17 @@
-use std::pin::Pin;
+use std::{future::Future, pin::Pin};

-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_compression::tokio::write::ZstdEncoder;
 use async_stream::stream;

+use crate::to_io_err;
 use log::*;
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tokio_stream::StreamExt;
 use tokio_util::io::{ReaderStream, StreamReader};

-use crate::to_io_err;
-
-type FinishHandler = dyn FnOnce((u64, Option<Vec<u8>>)) -> Result<()> + Send;
+type FinishHandler =
+    dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send;
 /**
 * wrap an AsyncRead so that it is passthrough,
 * but also the written data is compressed and written into a buffer,
@ -26,7 +26,7 @@ pub fn async_read_and_write_to_cache<'a>(
    let inp = Box::pin(inp);
    let mut zstd_writer = Some(ZstdEncoder::with_quality(
        Vec::new(),
-        async_compression::Level::Precise(compression_level as u32),
+        async_compression::Level::Precise(compression_level),
    ));
    let mut bytes_written = 0;

@ -64,7 +64,7 @@ pub fn async_read_and_write_to_cache<'a>(
        };

        // EOF, finish!
-        on_finish(finish)
+        on_finish(finish).await.context("write_to_cache on_finish")
            .map_err(to_io_err)?;

    };
--- a/src/config.rs
+++ b/src/config.rs
@ -108,6 +108,7 @@ impl FromStr for CacheMaxBlobLen {
    rename_all = "kebab-case",
    about = env!("CARGO_PKG_DESCRIPTION"),
    author = env!("CARGO_PKG_HOMEPAGE"),
+    long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.",
    // TODO: long_about does not seem to work to only show this on short help
    after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]",
    usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]"
@ -197,9 +198,9 @@ pub struct CacheConfig {
    /// Disable caching of results
    ///
    /// By default, rga caches the extracted text, if it is small enough,
-    /// to a database in ~/.cache/rga on Linux,
-    /// ~/Library/Caches/rga on macOS,
-    /// or C:\Users\username\AppData\Local\rga on Windows.
+    /// to a database in ${XDG_CACHE_HOME-~/.cache}/ripgrep-all on Linux,
+    /// ~/Library/Caches/ripgrep-all on macOS,
+    /// or C:\Users\username\AppData\Local\ripgrep-all on Windows.
    /// This way, repeated searches on the same set of files will be much faster.
    /// If you pass this flag, all caching will be disabled.
    #[serde(default, skip_serializing_if = "is_default")]
@ -208,7 +209,9 @@ pub struct CacheConfig {

    /// Max compressed size to cache
    ///
-    /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G
+    /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time.
+    ///
+    /// Allowed suffixes on command line: k M G
    #[serde(default, skip_serializing_if = "is_default")]
    #[structopt(
        default_value,
--- a/src/preproc.rs
+++ b/src/preproc.rs
@ -3,25 +3,28 @@ use crate::adapters::*;
 use crate::caching_writer::async_read_and_write_to_cache;
 use crate::config::RgaConfig;
 use crate::matching::*;
+use crate::preproc_cache::CacheKey;
 use crate::recurse::concat_read_streams;
 use crate::{
-    preproc_cache::{LmdbCache, PreprocCache},
+    preproc_cache::{open_cache_db, PreprocCache},
    print_bytes,
 };
 use anyhow::*;
 use async_compression::tokio::bufread::ZstdDecoder;
 use async_stream::stream;
+// use futures::future::{BoxFuture, FutureExt};
 use log::*;
-use path_clean::PathClean;
 use postproc::PostprocPrefix;
+use std::future::Future;
 use std::io::Cursor;
 use std::path::Path;
+use std::pin::Pin;
 use std::sync::Arc;
 use tokio::io::AsyncBufRead;
 use tokio::io::AsyncBufReadExt;
 use tokio::io::BufReader;

-type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
+pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;

 async fn choose_adapter(
    config: &RgaConfig,
@ -120,36 +123,6 @@ pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
        .with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
 }

-fn compute_cache_key(
-    filepath_hint: &Path,
-    adapter: &dyn FileAdapter,
-    active_adapters: ActiveAdapters,
-) -> Result<Vec<u8>> {
-    let clean_path = filepath_hint.to_owned().clean();
-    let meta = std::fs::metadata(filepath_hint)
-        .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
-    let modified = meta.modified().expect("weird OS that can't into mtime");
-
-    if adapter.metadata().recurses {
-        let active_adapters_cache_key = active_adapters
-            .iter()
-            .map(|a| (a.metadata().name.clone(), a.metadata().version))
-            .collect::<Vec<_>>();
-        let key = (active_adapters_cache_key, clean_path, modified);
-        debug!("Cache key (with recursion): {:?}", key);
-        bincode::serialize(&key).context("could not serialize path")
-    } else {
-        let key = (
-            adapter.metadata().name.clone(),
-            adapter.metadata().version,
-            clean_path,
-            modified,
-        );
-        debug!("Cache key (no recursion): {:?}", key);
-        bincode::serialize(&key).context("could not serialize path")
-    }
-}
-
 async fn adapt_caching(
    ai: AdaptInfo,
    adapter: Arc<dyn FileAdapter>,
@ -166,41 +139,44 @@ async fn adapt_caching(
        ai.filepath_hint.to_string_lossy(),
        &meta.name
    );
-    let db_name = format!("{}.v{}", meta.name, meta.version);
    let cache_compression_level = ai.config.cache.compression_level;
    let cache_max_blob_len = ai.config.cache.max_blob_len;

-    let cache = if ai.is_real_file {
-        LmdbCache::open(&ai.config.cache)?
+    let cache = if ai.is_real_file && !ai.config.cache.disabled {
+        Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
    } else {
        None
    };

    let mut cache = cache.context("No cache?")?;
-    let cache_key: Vec<u8> =
-        compute_cache_key(&ai.filepath_hint, adapter.as_ref(), active_adapters)?;
+    let cache_key = CacheKey::new(&ai.filepath_hint, adapter.as_ref(), &active_adapters)?;
    // let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
-    let cached = cache.get(&db_name, &cache_key)?;
+    let cached = cache.get(&cache_key).await.context("cache.get")?;
    match cached {
        Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
        None => {
            debug!("cache MISS, running adapter with caching...");
-            let inp = loop_adapt(adapter.as_ref(), detection_reason, ai)?;
+            let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
            let inp = concat_read_streams(inp);
            let inp = async_read_and_write_to_cache(
                inp,
                cache_max_blob_len.0,
                cache_compression_level.0,
                Box::new(move |(uncompressed_size, compressed)| {
+                    Box::pin(async move {
                        debug!(
                            "uncompressed output: {}",
                            print_bytes(uncompressed_size as f64)
                        );
                        if let Some(cached) = compressed {
                            debug!("compressed output: {}", print_bytes(cached.len() as f64));
-                        cache.set(&db_name, &cache_key, &cached)?
+                            cache
+                                .set(&cache_key, cached)
+                                .await
+                                .context("writing to cache")?
                        }
                        Ok(())
+                    })
                }),
            )?;

@ -213,21 +189,34 @@ pub fn loop_adapt(
    adapter: &dyn FileAdapter,
    detection_reason: FileMatcher,
    ai: AdaptInfo,
+) -> Pin<Box<dyn Future<Output = anyhow::Result<AdaptedFilesIterBox>> + Send + '_>> {
+    Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
+}
+pub async fn loop_adapt_inner(
+    adapter: &dyn FileAdapter,
+    detection_reason: FileMatcher,
+    ai: AdaptInfo,
 ) -> anyhow::Result<AdaptedFilesIterBox> {
    let fph = ai.filepath_hint.clone();
-    let inp = adapter.adapt(ai, &detection_reason).with_context(|| {
+    let inp = adapter.adapt(ai, &detection_reason).await;
+    let inp = if adapter.metadata().name == "postprocprefix" {
+        // don't add confusing error context
+        inp?
+    } else {
+        inp.with_context(|| {
            format!(
                "adapting {} via {} failed",
                fph.to_string_lossy(),
                adapter.metadata().name
            )
-    })?;
+        })?
+    };
    let s = stream! {
        for await file in inp {
            match buf_choose_adapter(file?).await? {
                Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
                    if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
-                        let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
+                        let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
                        yield Ok(AdaptInfo {
                            inp: Box::pin(Cursor::new(s)),
                            ..ai
@ -243,7 +232,7 @@ pub fn loop_adapt(
                        ai.filepath_hint.to_string_lossy(),
                        &adapter.metadata().name
                    );
-                    for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai)? {
+                    for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
                        yield ifile;
                    }
                }
--- a/src/preproc_cache.rs
+++ b/src/preproc_cache.rs
@ -1,135 +1,188 @@
-use crate::{config::CacheConfig, print_bytes, print_dur};
-use anyhow::{format_err, Context, Result};
-use log::*;
-use rkv::backend::{BackendEnvironmentBuilder, LmdbEnvironment};
-use std::{fmt::Display, path::Path, time::Instant};
+use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
+use anyhow::{Context, Result};
+use path_clean::PathClean;
+use rusqlite::{named_params, OptionalExtension};
+use std::{path::Path, time::UNIX_EPOCH};
+use tokio_rusqlite::Connection;

-pub trait PreprocCache: Send + Sync {
-    /*/// gets cache at specified key.
-    /// if cache hit, return the resulting data
-    /// else, run the given lambda, and store its result in the cache if present
-    fn get_or_run<'a>(
-        &mut self,
-        db_name: &str,
-        key: &[u8],
-        debug_name: &str,
-        runner: Box<dyn FnOnce() -> Result<Option<Vec<u8>>> + 'a>,
-    ) -> Result<Option<Vec<u8>>>;*/
-
-    fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>>;
-    fn set(&mut self, db_name: &str, key: &[u8], value: &[u8]) -> Result<()>;
+/// Identifies one cached extraction result: which adapter (and adapter version) produced
+/// it, for which file path, and at which modification time of that file.
+#[derive(Clone, Debug)]
+pub struct CacheKey {
+    /// Name of the adapter that produced the cached output.
+    adapter: String,
+    /// Version of that adapter, as reported by its metadata.
+    adapter_version: i32,
+    /// JSON list of `name.vVERSION` entries for the active adapters when the adapter
+    /// recurses; the literal string `null` otherwise.
+    active_adapters: String,
+    /// Cleaned path of the source file.
+    file_path: String,
+    /// Modification time of the source file, in milliseconds relative to the Unix epoch.
+    file_mtime_unix_ms: i64,
 }
-
-/// opens a LMDB cache
-fn open_cache_db(
-    path: &Path,
-) -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>> {
-    std::fs::create_dir_all(path)?;
-    // use rkv::backend::LmdbEnvironmentFlags;
-
-    rkv::Manager::<LmdbEnvironment>::singleton()
-        .write()
-        .map_err(|_| format_err!("could not write cache db manager"))?
-        .get_or_create(path, |p| {
-            let mut builder = rkv::Rkv::environment_builder::<rkv::backend::Lmdb>();
-            builder
-                .set_flags(rkv::EnvironmentFlags::NO_SYNC)
-                .set_flags(rkv::EnvironmentFlags::WRITE_MAP) // not durable cuz it's a cache
-                // i'm not sure why NO_TLS is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
-                // LmdbError(ReadersFull). Those "open readers" stay even after the corresponding processes exit.
-                // hope setting this doesn't break integrity
-                .set_flags(rkv::EnvironmentFlags::NO_TLS)
-                // sometimes, this seems to cause the data.mdb file to appear as 2GB in size (with holes), but sometimes not?
-                .set_map_size(2 * 1024 * 1024 * 1024)
-                .set_max_dbs(100)
-                .set_max_readers(128);
-            rkv::Rkv::from_builder(p, builder)
+impl CacheKey {
+    /// Builds the cache key for `filepath_hint` as processed by `adapter`.
+    ///
+    /// Reads the file's metadata from disk to obtain its modification time. When the
+    /// adapter's metadata marks it as recursing, the key also records the name and version
+    /// of every entry in `active_adapters` (as a JSON list); otherwise that component is
+    /// the literal string `null`.
+    ///
+    /// # Errors
+    /// Fails if the file metadata or its modification time cannot be read, or if the
+    /// adapter list cannot be serialized.
+    pub fn new(
+        filepath_hint: &Path,
+        adapter: &dyn FileAdapter,
+        active_adapters: &ActiveAdapters,
+    ) -> Result<CacheKey> {
+        let meta = std::fs::metadata(filepath_hint)
+            .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
+        // Not every platform exposes an mtime; report that as an error instead of panicking.
+        let modified = meta
+            .modified()
+            .with_context(|| format!("reading mtime for {}", filepath_hint.to_string_lossy()))?;
+        // Timestamps before 1970 are legal; encode them as negative offsets instead of
+        // refusing to build a key for such files.
+        let file_mtime_unix_ms = match modified.duration_since(UNIX_EPOCH) {
+            Ok(after_epoch) => after_epoch.as_millis() as i64,
+            Err(before_epoch) => -(before_epoch.duration().as_millis() as i64),
+        };
+        let adapter_meta = adapter.metadata();
+        let active_adapters = if adapter_meta.recurses {
+            serde_json::to_string(
+                &active_adapters
+                    .iter()
+                    .map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
+                    .collect::<Vec<_>>(),
+            )?
+        } else {
+            "null".to_string()
+        };
+        Ok(CacheKey {
+            adapter: adapter_meta.name.clone(),
+            adapter_version: adapter_meta.version,
+            file_path: filepath_hint.clean().to_string_lossy().to_string(),
+            file_mtime_unix_ms,
+            active_adapters,
        })
-        .map_err(|e| format_err!("could not get/create cache db: {}", e))
-}
-
-pub struct LmdbCache {
-    db_arc: std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>,
-}
-
-impl LmdbCache {
-    pub fn open(config: &CacheConfig) -> Result<Option<LmdbCache>> {
-        if config.disabled {
-            return Ok(None);
-        }
-        let path = Path::new(&config.path.0);
-        Ok(Some(LmdbCache {
-            db_arc: open_cache_db(path)?,
-        }))
    }
 }

-#[derive(Debug)]
-struct RkvErrWrap(rkv::StoreError);
-impl Display for RkvErrWrap {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.0.fmt(f)
+/// Storage backend for preprocessed adapter output, addressed by a [`CacheKey`].
+#[async_trait::async_trait]
+pub trait PreprocCache {
+    /// Looks up the bytes stored for `key`; the `Option` in the return type lets an
+    /// implementation report that nothing is stored.
+    async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
+    /// Stores `value` under `key`.
+    async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
+}
+
+/// Applies the connection pragmas, makes sure the `preproc_cache` table and its unique
+/// index exist, and calls `create_pragmas` when the database's `application_id` is not
+/// the expected value.
+async fn connect_pragmas(db: &Connection) -> Result<()> {
+    // Tuning background: https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
+    // A custom page_size (32768) is deliberately not set here at the moment.
+    let setup_sql = "
+    pragma journal_mode = WAL;
+    pragma foreign_keys = on;
+    pragma temp_store = memory;
+    pragma synchronous = off; -- integrity isn't very important here
+    pragma mmap_size = 30000000000;
+
+    create table if not exists preproc_cache (
+        adapter text not null,
+        adapter_version integer not null,
+        created_unix_ms integer not null default (unixepoch() * 1000),
+        active_adapters text not null, -- 'null' if adapter cannot recurse
+        file_path text not null,
+        file_mtime_unix_ms integer not null,
+        text_content_zstd blob not null
+    ) strict;
+    
+    create unique index if not exists preproc_cache_idx on preproc_cache (adapter, adapter_version, file_path, active_adapters);
+    ";
+    db.call(move |conn| conn.execute_batch(setup_sql))
+        .await
+        .context("connect_pragmas")?;
+    let application_id: i64 = db
+        .call(|conn| conn.pragma_query_value(None, "application_id", |row| row.get(0)))
+        .await?;
+    if application_id == 924716026 {
+        return Ok(());
+    }
+    // (probably) newly created db
+    create_pragmas(db).await.context("create_pragmas")
+}
+
+/// Stamps the database with this cache's `application_id` and schema `user_version`.
+async fn create_pragmas(db: &Connection) -> Result<()> {
+    let stamp_sql = "
+        pragma application_id = 924716026;
+        pragma user_version = 2; -- todo: on upgrade clear db if version is unexpected
+        ";
+    db.call(move |conn| conn.execute_batch(stamp_sql)).await?;
+    Ok(())
+}
+/// Cache backend that keeps its entries in a SQLite database.
+struct SqliteCache {
+    /// Handle to the opened database.
+    db: Connection,
+}
+impl SqliteCache {
+    /// Opens `cache.sqlite3` inside the directory `path` and runs `connect_pragmas` on it.
+    async fn new(path: &Path) -> Result<SqliteCache> {
+        let db_file = path.join("cache.sqlite3");
+        let db = Connection::open(db_file).await?;
+        connect_pragmas(&db).await?;
+        Ok(SqliteCache { db })
    }
 }
-impl std::error::Error for RkvErrWrap {}

-impl PreprocCache for LmdbCache {
-    fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>> {
-        let start = Instant::now();
-        let db_env = self
-            .db_arc
-            .read()
-            .map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
-        let db = db_env
-            .open_single(db_name, rkv::store::Options::create())
-            .map_err(RkvErrWrap)
-            .context("could not open cache db store")?;
-
-        let reader = db_env.read().expect("could not get reader");
-        let cached = db
-            .get(&reader, key)
-            .map_err(RkvErrWrap)
-            .context("could not read from db")?;
-
-        match cached {
-            Some(rkv::Value::Blob(cached)) => {
-                debug!(
-                    "cache HIT, reading {} (compressed) from cache",
-                    print_bytes(cached.len() as f64)
-                );
-                debug!("reading from cache took {}", print_dur(start));
-                Ok(Some(Vec::from(cached)))
+// SQLite-backed implementation of the cache trait; both methods run their statement
+// through `self.db.call`.
+#[async_trait::async_trait]
+impl PreprocCache for SqliteCache {
+    /// Fetches `text_content_zstd` for the row matching every component of `key`
+    /// (adapter, adapter version, active adapters, file path and file mtime).
+    /// Returns `Ok(None)` when the query yields no row.
+    async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
+        // The `move` closure below takes ownership of its captures, hence the owned copy.
+        let key = (*key).clone(); // todo: without cloning
+        Ok(self
+            .db
+            .call(move |db| {
+                db.query_row(
+                    "select text_content_zstd from preproc_cache where
+                            adapter = :adapter
+                        and adapter_version = :adapter_version
+                        and active_adapters = :active_adapters
+                        and file_path = :file_path
+                        and file_mtime_unix_ms = :file_mtime_unix_ms
+                ",
+                    named_params! {
+                        ":adapter": &key.adapter,
+                        ":adapter_version": &key.adapter_version,
+                        ":active_adapters": &key.active_adapters,
+                        ":file_path": &key.file_path,
+                        ":file_mtime_unix_ms": &key.file_mtime_unix_ms
+                    },
+                    |r| r.get::<_, Vec<u8>>(0),
+                )
+                // turns the "no rows" error into `Ok(None)`
+                .optional()
+            })
+            .await
+            .context("reading from cache")?)
    }
-            Some(_) => Err(format_err!("Integrity: value not blob"))?,
-            None => Ok(None),
+
+    /// Inserts `value` for `key`. If a row with the same adapter, adapter version,
+    /// active adapters and file path already exists, its mtime, creation timestamp and
+    /// content are overwritten instead.
+    async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
+        // The `move` closure below takes ownership of its captures, hence the owned copy.
+        let key = (*key).clone(); // todo: without cloning
+        Ok(self
+            .db
+            .call(move |db| {
+                db.execute(
+                    "insert into preproc_cache (adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
+                        (:adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
+                    on conflict (adapter, adapter_version, active_adapters, file_path) do update set
+                        file_mtime_unix_ms = :file_mtime_unix_ms,
+                        created_unix_ms = unixepoch() * 1000,
+                        text_content_zstd = :text_content_zstd",
+                    named_params! {
+                        ":adapter": &key.adapter,
+                        ":adapter_version": &key.adapter_version,
+                        ":active_adapters": &key.active_adapters,
+                        ":file_path": &key.file_path,
+                        ":file_mtime_unix_ms": &key.file_mtime_unix_ms,
+                        ":text_content_zstd": value
                    }
+                // the value returned by `execute` is not needed by callers
+                ).map(|_| ())
+            })
+            .await?)
    }
-    fn set(&mut self, db_name: &str, key: &[u8], got: &[u8]) -> Result<()> {
-        let start = Instant::now();
-        debug!("writing {} to cache", print_bytes(got.len() as f64));
-        let db_env = self
-            .db_arc
-            .read()
-            .map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
+}
+/// Opens the default cache implementation, creating the directory `path` (and any
+/// missing parent directories) first.
+pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
+    std::fs::create_dir_all(path)?;
+    let cache = SqliteCache::new(path).await?;
+    Ok(cache)
+}

-        let db = db_env
-            .open_single(db_name, rkv::store::Options::create())
-            .map_err(RkvErrWrap)
-            .context("could not open cache db store")?;
+// Tests for the cache backend.
+#[cfg(test)]
+mod test {

-        let mut writer = db_env
-            .write()
-            .map_err(RkvErrWrap)
-            .with_context(|| format_err!("could not open write handle to cache"))?;
+    use crate::preproc_cache::*;

-        db.put(&mut writer, key, &rkv::Value::Blob(got))
-            .map_err(RkvErrWrap)
-            .with_context(|| format_err!("could not write to cache"))?;
-        writer
-            .commit()
-            .map_err(RkvErrWrap)
-            .with_context(|| "could not write cache".to_string())?;
-        debug!("writing to cache took {}", print_dur(start));
+    // Smoke test: opening a cache below a fresh temporary directory succeeds.
+    // NOTE(review): despite the name, nothing is written or read back yet.
+    #[tokio::test]
+    async fn test_read_write() -> anyhow::Result<()> {
+        let path = tempfile::tempdir()?;
+        // `open_cache_db` receives this path as the cache directory to create, so
+        // `foo.sqlite3` is used as a directory name here.
+        let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
+        // db.set();
        Ok(())
    }
 }