Merge remote-tracking branch 'upstream/master' into mbox-extractor

This commit is contained in:
FliegendeWurst 2023-07-31 14:34:18 +02:00
commit 2259730c67
26 changed files with 1358 additions and 1298 deletions

27
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@ -0,0 +1,27 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
**To Reproduce**
Attach example file:
Run command:
**Output**
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Operating System and Version**
**Output of `rga --version`**

View File

@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

View File

@ -1,75 +1,25 @@
# Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md # Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
# #
# While our "example" application has the platform-specific code, # While our "example" application has platform-specific code,
# for simplicity we are compiling and testing everything on the Ubuntu environment only. # for simplicity we are compiling and testing everything in a nix-on-Linux environment only.
# For multi-OS testing see the `cross.yml` workflow.
on: [push, pull_request] on: [push, pull_request]
name: ci name: ci
jobs: jobs:
check: nix-flake-check:
name: Check name: nix flake check
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout sources - name: Checkout sources
uses: actions/checkout@v2 uses: actions/checkout@v3
- name: Install stable toolchain - name: Install nix
uses: actions-rs/toolchain@v1 uses: cachix/install-nix-action@v21
with:
profile: minimal
toolchain: stable
override: true
- name: Run cargo check - name: Ensure the build succeeds
uses: actions-rs/cargo@v1 run: nix build
with:
command: check
test: - name: Run `nix flake check` to run formatters, linters, and tests
name: Test Suite run: nix flake check --print-build-logs
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install stable toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
command: test
lints:
name: Lints
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install stable toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
components: rustfmt, clippy
- name: Run cargo fmt
uses: actions-rs/cargo@v1
with:
command: fmt
args: --all -- --check
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings

View File

@ -18,7 +18,7 @@ on:
# branches: # branches:
# - ag/release # - ag/release
tags: tags:
- 'v[0-9]+.[0-9]+.[0-9]+' - "v[0-9]+.[0-9]+.[0-9]+*"
jobs: jobs:
create-release: create-release:
name: create-release name: create-release
@ -62,7 +62,7 @@ jobs:
build-release: build-release:
name: build-release name: build-release
needs: ['create-release'] needs: ["create-release"]
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
env: env:
# For some builds, we use cross to test on 32-bit and big-endian # For some builds, we use cross to test on 32-bit and big-endian
@ -79,11 +79,11 @@ jobs:
build: [linux, linux-arm, macos, win-msvc] build: [linux, linux-arm, macos, win-msvc]
include: include:
- build: linux - build: linux
os: ubuntu-18.04 os: ubuntu-22.04
rust: nightly rust: nightly
target: x86_64-unknown-linux-musl target: x86_64-unknown-linux-musl
- build: linux-arm - build: linux-arm
os: ubuntu-18.04 os: ubuntu-22.04
rust: nightly rust: nightly
target: arm-unknown-linux-gnueabihf target: arm-unknown-linux-gnueabihf
- build: macos - build: macos
@ -106,7 +106,7 @@ jobs:
fetch-depth: 1 fetch-depth: 1
- name: Install packages (Ubuntu) - name: Install packages (Ubuntu)
if: matrix.os == 'ubuntu-18.04' if: matrix.os == 'ubuntu-22.04'
run: | run: |
ci/ubuntu-install-packages ci/ubuntu-install-packages
@ -124,7 +124,7 @@ jobs:
target: ${{ matrix.target }} target: ${{ matrix.target }}
- name: Use Cross - name: Use Cross
# if: matrix.os != 'windows-2019' shell: bash
run: | run: |
cargo install cross cargo install cross
echo "CARGO=cross" >> $GITHUB_ENV echo "CARGO=cross" >> $GITHUB_ENV

1155
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
[package] [package]
authors = ["phiresky <phireskyde+git@gmail.com>"] authors = ["phiresky <phireskyde+git@gmail.com>"]
description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc." description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
edition = "2018" edition = "2021"
exclude = [ exclude = [
"exampledir/*", "exampledir/*",
] ]
@ -11,56 +11,57 @@ license = "AGPL-3.0-or-later"
name = "ripgrep_all" name = "ripgrep_all"
readme = "README.md" readme = "README.md"
repository = "https://github.com/phiresky/ripgrep-all" repository = "https://github.com/phiresky/ripgrep-all"
version = "1.0.0-alpha.2" version = "1.0.0-alpha.5"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
anyhow = "1.0.32" anyhow = {version = "1.0.71", features = ["backtrace"]}
async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]} async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] }
async-stream = "0.3.3" async-stream = "0.3.5"
async-trait = "0.1.64" async-trait = "0.1.68"
async_zip = "0.0.9" async_zip = {version = "0.0.12", features = ["full"]}
bincode = "1.3.1" bincode = "1.3.3"
bytes = "1.2.1" bytes = "1.4.0"
clap = {version = "4.0.18", features = ["wrap_help"]} clap = {version = "4.3.0", features = ["wrap_help"]}
crossbeam = "0.8.1" crossbeam = "0.8.2"
crossbeam-channel = "0.5.1" crossbeam-channel = "0.5.8"
derive_more = "0.99.9" derive_more = "0.99.17"
directories-next = "2.0.0" directories-next = "2.0.0"
dyn-clonable = "0.9.0" dyn-clonable = "0.9.0"
dyn-clone = "1.0.2" dyn-clone = "1.0.11"
encoding_rs = "0.8.24" encoding_rs = "0.8.32"
encoding_rs_io = "0.1.7" encoding_rs_io = "0.1.7"
env_logger = "0.9.0" env_logger = "0.10.0"
glob = "0.3.0" glob = "0.3.1"
json_comments = "0.2.1" json_comments = "0.2.1"
lazy_static = "1.4.0" lazy_static = "1.4.0"
log = "0.4.11" log = "0.4.17"
mailbox = "0.2.0" mailbox = "0.2.0"
mailparse = "0.14.0" mailparse = "0.14.0"
memchr = "2.3.3" memchr = "2.5.0"
mime2ext = "0.1.52" mime2ext = "0.1.52"
paste = "1.0.0" paste = "1.0.12"
path-clean = "0.1.0" path-clean = "1.0.1"
pretty-bytes = "0.2.2" pretty-bytes = "0.2.2"
regex = "1.3.9" regex = "1.8.2"
rkv = "0.17" rusqlite = {version = "0.29.0", features = ["vtab", "bundled"]}
rusqlite = {version = "0.28.0", features = ["vtab", "bundled"]} schemars = {version = "0.8.12", features = ["preserve_order"]}
schemars = {version = "0.8.0-alpha-4", features = ["preserve_order"]} serde = {version = "1.0.163", features = ["derive"]}
serde = {version = "1.0.115", features = ["derive"]} serde_json = "1.0.96"
serde_json = "1.0.57"
size_format = "1.0.2" size_format = "1.0.2"
structopt = "0.3.17" structopt = "0.3.26"
tempfile = "3.1.0" tempfile = "3.5.0"
tokio = {version = "1.21.2", features = ["full"]} tokio = {version = "1.28.1", features = ["full"]}
tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]} tokio-rusqlite = "0.4.0"
tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]}
tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" } tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" }
tokio-util = {version = "0.7.4", features = ["io", "full"]} tokio-util = {version = "0.7.8", features = ["io", "full"]}
tree_magic = {package = "tree_magic_mini", version = "3.0.0"} tree_magic = {package = "tree_magic_mini", version = "3.0.3"}
[dev-dependencies] [dev-dependencies]
async-recursion = "1.0.0" async-recursion = "1.0.4"
ctor = "0.1.20" ctor = "0.2.0"
pretty_assertions = "1.3.0" pretty_assertions = "1.3.0"
tempfile = "3.5.0"
tokio-test = "0.4.2" tokio-test = "0.4.2"

155
README.md
View File

@ -33,45 +33,7 @@ demo/
![rga-fzf](doc/rga-fzf.gif) ![rga-fzf](doc/rga-fzf.gif)
You can use rga interactively via fzf. Add the following to your ~/.{bash,zsh}rc: See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.
```bash
rga-fzf() {
RG_PREFIX="rga --files-with-matches"
local file
file="$(
FZF_DEFAULT_COMMAND="$RG_PREFIX '$1'" \
fzf --sort --preview="[[ ! -z {} ]] && rga --pretty --context 5 {q} {}" \
--phony -q "$1" \
--bind "change:reload:$RG_PREFIX {q}" \
--preview-window="70%:wrap"
)" &&
echo "opening $file" &&
xdg-open "$file"
}
```
And for your `~/.config/fish/config.fish`:
```
function rga-fzf
set RG_PREFIX 'rga --files-with-matches'
if test (count $argv) -gt 1
set RG_PREFIX "$RG_PREFIX $argv[1..-2]"
end
set -l file $file
set file (
FZF_DEFAULT_COMMAND="$RG_PREFIX '$argv[-1]'" \
fzf --sort \
--preview='test ! -z {} && \
rga --pretty --context 5 {q} {}' \
--phony -q "$argv[-1]" \
--bind "change:reload:$RG_PREFIX {q}" \
--preview-window='50%:wrap'
) && \
echo "opening $file" && \
open "$file"
end
```
## INSTALLATION ## INSTALLATION
@ -86,9 +48,11 @@ Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestr
`pacman -S ripgrep-all`. `pacman -S ripgrep-all`.
#### Nix #### Nix
`nix-env -iA nixpkgs.ripgrep-all` `nix-env -iA nixpkgs.ripgrep-all`
#### Debian-based #### Debian-based
download the [rga binary][latestrelease] and get the dependencies like this: download the [rga binary][latestrelease] and get the dependencies like this:
`apt install ripgrep pandoc poppler-utils ffmpeg` `apt install ripgrep pandoc poppler-utils ffmpeg`
@ -117,7 +81,7 @@ If you get an error like `VCRUNTIME140.DLL could not be found`, you need to inst
To install the dependencies that are each not strictly necessary but very useful: To install the dependencies that are each not strictly necessary but very useful:
`brew install pandoc poppler tesseract ffmpeg` `brew install pandoc poppler ffmpeg`
### Compile from source ### Compile from source
@ -131,58 +95,58 @@ rga should compile with stable Rust (v1.36.0+, check with `rustc --version`). To
## Available Adapters ## Available Adapters
rga works with _adapters_ that adapt various file formats. It comes with a few adapters integrated:
``` ```
rga --rga-list-adapters rga --rga-list-adapters
``` ```
You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
<!-- this part generated by update-readme.sh --> <!-- this part generated by update-readme.sh -->
Adapters: Adapters:
- **ffmpeg** - **pandoc**
Uses ffmpeg to extract video metadata/chapters and subtitles. Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
Extensions: `.mkv`, `.mp4`, `.avi` Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx
Extensions: .epub, .odt, .docx, .fb2, .ipynb
* **pandoc**
Uses pandoc to convert binary/unreadable text documents to plain markdown-like text.
Extensions: `.epub`, `.odt`, `.docx`, `.fb2`, `.ipynb`
- **poppler** - **poppler**
Uses pdftotext (from poppler-utils) to extract plain text from PDF files. Uses pdftotext (from poppler-utils) to extract plain text from PDF files
Extensions: `.pdf` Runs: pdftotext - -
Mime Types: `application/pdf` Extensions: .pdf
Mime Types: application/pdf
- **postprocpagebreaks**
Adds the page number to each line of an input file that marks page breaks with the ASCII page break character.
Mainly to be used internally by the poppler adapter.
Extensions: .asciipagebreaks
- **ffmpeg**
Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata
Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm
- **zip** - **zip**
Reads a zip file as a stream and recurses down into its contents. Reads a zip file as a stream and recurses down into its contents
Extensions: `.zip` Extensions: .zip, .jar
Mime Types: `application/zip` Mime Types: application/zip
- **decompress** - **decompress**
Reads compressed file as a stream and runs a different extractor on the contents. Reads compressed file as a stream and runs a different extractor on the contents.
Extensions: `.tgz`, `.tbz`, `.tbz2`, `.gz`, `.bz2`, `.xz`, `.zst` Extensions: .tgz, .tbz, .tbz2, .gz, .bz2, .xz, .zst
Mime Types: `application/gzip`, `application/x-bzip`, `application/x-xz`, `application/zstd` Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd
- **tar** - **tar**
Reads a tar file as a stream and recurses down into its contents. Reads a tar file as a stream and recurses down into its contents
Extensions: `.tar` Extensions: .tar
* **sqlite** - **sqlite**
Uses sqlite bindings to convert sqlite databases into a simple plain text format. Uses sqlite bindings to convert sqlite databases into a simple plain text format
Extensions: `.db`, `.db3`, `.sqlite`, `.sqlite3` Extensions: .db, .db3, .sqlite, .sqlite3
Mime Types: `application/x-sqlite3` Mime Types: application/x-sqlite3
The following adapters are disabled by default, and can be enabled using `--rga-adapters=+pdfpages,tesseract`: The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':
- **pdfpages**
Converts a pdf to its individual pages as png files. Only useful in combination with tesseract.
Extensions: `.pdf`
Mime Types: `application/pdf`
- **tesseract**
Uses tesseract to run OCR on images to make them searchable.
May need `-j1` to prevent overloading the system.
Make sure you have tesseract installed.
Extensions: `.jpg`, `.png`
## USAGE: ## USAGE:
@ -202,6 +166,17 @@ The following adapters are disabled by default, and can be enabled using `--rga-
> Detection is only done on the first 8KiB of the file, since we can\'t > Detection is only done on the first 8KiB of the file, since we can\'t
> always seek on the input (in archives). > always seek on the input (in archives).
**\--rga-no-cache**
> Disable caching of results
>
> By default, rga caches the extracted text, if it is small enough, to a
> database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
> _\~/Library/Caches/ripgrep-all_ on macOS, or
> C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
> repeated searches on the same set of files will be much faster. If you
> pass this flag, all caching will be disabled.
**-h**, **\--help** **-h**, **\--help**
> Prints help information > Prints help information
@ -210,15 +185,9 @@ The following adapters are disabled by default, and can be enabled using `--rga-
> List all known adapters > List all known adapters
**\--rga-no-cache** **\--rga-print-config-schema**
> Disable caching of results > Print the JSON Schema of the configuration file
>
> By default, rga caches the extracted text, if it is small enough, to a
> database in \~/.cache/rga on Linux, _\~/Library/Caches/rga_ on macOS,
> or C:\\Users\\username\\AppData\\Local\\rga on Windows. This way,
> repeated searches on the same set of files will be much faster. If you
> pass this flag, all caching will be disabled.
**\--rg-help** **\--rg-help**
@ -242,25 +211,32 @@ The following adapters are disabled by default, and can be enabled using `--rga-
> use all default adapters except for bar and baz. \"+bar,baz\" means > use all default adapters except for bar and baz. \"+bar,baz\" means
> use all default adapters and also bar and baz. > use all default adapters and also bar and baz.
**\--rga-cache-compression-level=**\<cache-compression-level\> **\--rga-cache-compression-level=**\<compression-level\>
> ZSTD compression level to apply to adapter outputs before storing in > ZSTD compression level to apply to adapter outputs before storing in
> cache db > cache db
> >
> Ranges from 1 - 22 \[default: 12\] > Ranges from 1 - 22 \[default: 12\]
**\--rga-cache-max-blob-len=**\<cache-max-blob-len\> **\--rga-config-file=**\<config-file-path\>
> Max compressed size to cache
>
> Longest byte length (after compression) to store in cache. Longer
> adapter outputs will not be cached and recomputed every time. Allowed
> suffixes: k M G \[default: 2000000\]
**\--rga-max-archive-recursion=**\<max-archive-recursion\> **\--rga-max-archive-recursion=**\<max-archive-recursion\>
> Maximum nestedness of archives to recurse into \[default: 4\] > Maximum nestedness of archives to recurse into \[default: 4\]
**\--rga-cache-max-blob-len=**\<max-blob-len\>
> Max compressed size to cache
>
> Longest byte length (after compression) to store in cache. Longer
> adapter outputs will not be cached and recomputed every time.
>
> Allowed suffixes on command line: k M G \[default: 2000000\]
**\--rga-cache-path=**\<path\>
> Path to store cache db \[default: /home/phire/.cache/ripgrep-all\]
**-h** shows a concise overview, **\--help** shows more detail and **-h** shows a concise overview, **\--help** shows more detail and
advanced options. advanced options.
@ -287,6 +263,7 @@ to debug the adapters.
You can use the provided [`flake.nix`](./flake.nix) to setup all build- and You can use the provided [`flake.nix`](./flake.nix) to setup all build- and
run-time dependencies: run-time dependencies:
1. Enable [Flakes](https://nixos.wiki/wiki/Flakes) in your Nix configuration. 1. Enable [Flakes](https://nixos.wiki/wiki/Flakes) in your Nix configuration.
1. Add [`direnv`](https://direnv.net/) to your profile: 1. Add [`direnv`](https://direnv.net/) to your profile:
`nix profile install nixpkgs#direnv` `nix profile install nixpkgs#direnv`

View File

@ -7,7 +7,7 @@
// https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc // https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc
// The config options are the same as the command line options, // The config options are the same as the command line options,
// but with --rga- prefix removed and - replaced with _. // but with --rga- prefix removed and - and . replaced with _.
// e.g. --rga-no-cache becomes `"no_cache": true`. // e.g. --rga-no-cache becomes `"no_cache": true`.
// The only exception is the `custom_adapters` option, which can only be set in this file. // The only exception is the `custom_adapters` option, which can only be set in this file.

View File

@ -5,7 +5,7 @@ content=$(
<!-- this part generated by update-readme.sh --> <!-- this part generated by update-readme.sh -->
$(cargo run --bin rga -- --rga-list-adapters) $(cargo run --bin rga -- --rga-list-adapters)
$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --atx-headers | rg --multiline "## USAGE:(.|\n)*") $(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
<!-- end of part generated by update-readme.sh --> <!-- end of part generated by update-readme.sh -->
END END
) )

BIN
exampledir/exif.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

View File

@ -3,11 +3,11 @@
"advisory-db": { "advisory-db": {
"flake": false, "flake": false,
"locked": { "locked": {
"lastModified": 1670452192, "lastModified": 1685821301,
"narHash": "sha256-f8NIFbqSbCzpssgDUK4srfgKaVaMhDScEptw4uuxGAc=", "narHash": "sha256-4XRcnSboLJw1XKjDpg2jBU70jEw/8Bgx4nUmnq3kXbY=",
"owner": "rustsec", "owner": "rustsec",
"repo": "advisory-db", "repo": "advisory-db",
"rev": "0a2faeb87195392b23333a8097309d29f2c5d31d", "rev": "af3f3d503f82056785841bee49997bae65eba1c0",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -26,11 +26,11 @@
"rust-overlay": "rust-overlay" "rust-overlay": "rust-overlay"
}, },
"locked": { "locked": {
"lastModified": 1670546681, "lastModified": 1684981077,
"narHash": "sha256-S33bhME0zPHPEZyZPCsrdQL/4WW/A020PwN+a3z7Q+I=", "narHash": "sha256-68X9cFm0RTZm8u0rXPbeBzOVUH5OoUGAfeHHVoxGd9o=",
"owner": "ipetkov", "owner": "ipetkov",
"repo": "crane", "repo": "crane",
"rev": "63f80ee278897e72a1468090278716b5befa5128", "rev": "35110cccf28823320f4fd697fcafcb5038683982",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -42,11 +42,11 @@
"flake-compat": { "flake-compat": {
"flake": false, "flake": false,
"locked": { "locked": {
"lastModified": 1668681692, "lastModified": 1673956053,
"narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=", "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
"owner": "edolstra", "owner": "edolstra",
"repo": "flake-compat", "repo": "flake-compat",
"rev": "009399224d5e398d03b22badca40a37ac85412a1", "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -58,11 +58,11 @@
"flake-compat_2": { "flake-compat_2": {
"flake": false, "flake": false,
"locked": { "locked": {
"lastModified": 1668681692, "lastModified": 1673956053,
"narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=", "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
"owner": "edolstra", "owner": "edolstra",
"repo": "flake-compat", "repo": "flake-compat",
"rev": "009399224d5e398d03b22badca40a37ac85412a1", "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -72,12 +72,15 @@
} }
}, },
"flake-utils": { "flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": { "locked": {
"lastModified": 1667395993, "lastModified": 1681202837,
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=", "narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
"owner": "numtide", "owner": "numtide",
"repo": "flake-utils", "repo": "flake-utils",
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f", "rev": "cfacdce06f30d2b68473a46042957675eebb3401",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -87,27 +90,15 @@
} }
}, },
"flake-utils_2": { "flake-utils_2": {
"locked": { "inputs": {
"lastModified": 1667395993, "systems": "systems_2"
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
"type": "github"
}, },
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"flake-utils_3": {
"locked": { "locked": {
"lastModified": 1667395993, "lastModified": 1685518550,
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=", "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide", "owner": "numtide",
"repo": "flake-utils", "repo": "flake-utils",
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f", "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -139,48 +130,31 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1670525689, "lastModified": 1685860998,
"narHash": "sha256-YIjGzxrRQa5LYO0zlnH/ardcwXsRgsnHe3TkGkvCxbc=", "narHash": "sha256-ZexAPe8yvJaLvn5aVgjW0vY41RnmJGbgOdGBJk1yDIE=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "f21f11aa2a02cb78651c6d57546c7d7541f9240c", "rev": "45d47b647d7bbaede5121d731cbee78f6093b6d6",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }
}, },
"nixpkgs-stable": { "nixpkgs-stable": {
"locked": { "locked": {
"lastModified": 1668984258, "lastModified": 1678872516,
"narHash": "sha256-0gDMJ2T3qf58xgcSbYoXiRGUkPWmKyr5C3vcathWhKs=", "narHash": "sha256-/E1YwtMtFAu2KUQKV/1+KFuReYPANM2Rzehk84VxVoc=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "cf63ade6f74bbc9d2a017290f1b2e33e8fbfa70a", "rev": "9b8e5abb18324c7fe9f07cb100c3cd4a29cda8b8",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixos-22.05", "ref": "nixos-22.11",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1668994630,
"narHash": "sha256-1lqx6HLyw6fMNX/hXrrETG1vMvZRGm2XVC9O/Jt0T6c=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "af50806f7c6ab40df3e6b239099e8f8385f6c78b",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }
@ -188,17 +162,21 @@
"pre-commit-hooks": { "pre-commit-hooks": {
"inputs": { "inputs": {
"flake-compat": "flake-compat_2", "flake-compat": "flake-compat_2",
"flake-utils": "flake-utils_3", "flake-utils": [
"flake-utils"
],
"gitignore": "gitignore", "gitignore": "gitignore",
"nixpkgs": "nixpkgs_2", "nixpkgs": [
"nixpkgs"
],
"nixpkgs-stable": "nixpkgs-stable" "nixpkgs-stable": "nixpkgs-stable"
}, },
"locked": { "locked": {
"lastModified": 1670413394, "lastModified": 1685361114,
"narHash": "sha256-M7sWqrKtOqUv9euX1t3HCxis8cPy9MNiZxQmUf0KF1o=", "narHash": "sha256-4RjrlSb+OO+e1nzTExKW58o3WRwVGpXwj97iCta8aj4=",
"owner": "cachix", "owner": "cachix",
"repo": "pre-commit-hooks.nix", "repo": "pre-commit-hooks.nix",
"rev": "1303a1a76e9eb074075bfe566518c413f6fc104e", "rev": "ca2fdbf3edda2a38140184da6381d49f8206eaf4",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -229,11 +207,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1670034122, "lastModified": 1683080331,
"narHash": "sha256-EqmuOKucPWtMvCZtHraHr3Q3bgVszq1x2PoZtQkUuEk=", "narHash": "sha256-nGDvJ1DAxZIwdn6ww8IFwzoHb2rqBP4wv/65Wt5vflk=",
"owner": "oxalica", "owner": "oxalica",
"repo": "rust-overlay", "repo": "rust-overlay",
"rev": "a0d5773275ecd4f141d792d3a0376277c0fc0b65", "rev": "d59c3fa0cba8336e115b376c2d9e91053aa59e56",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -252,11 +230,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1670552927, "lastModified": 1685846256,
"narHash": "sha256-lCE51eAGrAFS4k9W5aDGFpVtOAwQQ/rFMN80PCDh0vo=", "narHash": "sha256-G4aYK4VqlMHImvZ0lUnLHw1A+Cx28T0sBMvAKZBcGpk=",
"owner": "oxalica", "owner": "oxalica",
"repo": "rust-overlay", "repo": "rust-overlay",
"rev": "a0fdafd18c9cf599fde17fbaf07dbb20fa57eecb", "rev": "1ef3c6de6127a1cba94cc5492cdde52e33d06ea4",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -264,6 +242,36 @@
"repo": "rust-overlay", "repo": "rust-overlay",
"type": "github" "type": "github"
} }
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
} }
}, },
"root": "root", "root": "root",

View File

@ -3,7 +3,7 @@
"ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."; "ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; nixpkgs.url = "github:NixOS/nixpkgs";
crane = { crane = {
url = "github:ipetkov/crane"; url = "github:ipetkov/crane";
@ -25,7 +25,13 @@
flake = false; flake = false;
}; };
pre-commit-hooks.url = "github:cachix/pre-commit-hooks.nix"; pre-commit-hooks = {
url = "github:cachix/pre-commit-hooks.nix";
inputs = {
nixpkgs.follows = "nixpkgs";
flake-utils.follows = "flake-utils";
};
};
}; };
outputs = { self, nixpkgs, crane, flake-utils, rust-overlay, advisory-db outputs = { self, nixpkgs, crane, flake-utils, rust-overlay, advisory-db
@ -36,14 +42,16 @@
inherit system; inherit system;
overlays = [ (import rust-overlay) ]; overlays = [ (import rust-overlay) ];
}; };
inherit (pkgs) lib;
craneLib = crane.lib.${system}; craneLib = crane.lib.${system};
src = craneLib.cleanCargoSource ./.; src = pkgs.lib.cleanSourceWith {
src = craneLib.path ./.;
filter = pkgs.lib.cleanSourceFilter;
};
buildInputs = with pkgs; buildInputs = with pkgs;
[ ffmpeg imagemagick pandoc poppler_utils ripgrep tesseract ] [ ffmpeg imagemagick pandoc poppler_utils ripgrep tesseract ]
++ lib.optionals pkgs.stdenv.isDarwin [ ++ pkgs.lib.optionals pkgs.stdenv.isDarwin [
# Additional darwin specific inputs can be set here # Additional darwin specific inputs can be set here
pkgs.libiconv pkgs.libiconv
]; ];
@ -54,10 +62,7 @@
# Build the actual crate itself, reusing the dependency # Build the actual crate itself, reusing the dependency
# artifacts from above. # artifacts from above.
rga = craneLib.buildPackage { rga = craneLib.buildPackage { inherit cargoArtifacts src buildInputs; };
inherit cargoArtifacts src buildInputs;
doCheck = false;
};
pre-commit = pre-commit-hooks.lib."${system}".run; pre-commit = pre-commit-hooks.lib."${system}".run;
in { in {
@ -97,18 +102,20 @@
hooks = { hooks = {
nixfmt.enable = true; nixfmt.enable = true;
rustfmt.enable = true; rustfmt.enable = true;
cargo-check.enable = true; typos = {
enable = true;
types = [ "text" ];
excludes = [ "exampledir/.*" ];
};
}; };
}; };
} // lib.optionalAttrs (system == "x86_64-linux") {
# NB: cargo-tarpaulin only supports x86_64 systems
# Check code coverage (note: this will not upload coverage anywhere)
rga-coverage =
craneLib.cargoTarpaulin { inherit cargoArtifacts src; };
}; };
# `nix build` # `nix build`
packages.default = rga; packages = {
inherit rga; # `nix build .#rga`
default = rga; # `nix build`
};
# `nix run` # `nix run`
apps.default = flake-utils.lib.mkApp { drv = rga; }; apps.default = flake-utils.lib.mkApp { drv = rga; };

View File

@ -10,6 +10,7 @@ pub mod writing;
pub mod zip; pub mod zip;
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
use anyhow::{format_err, Context, Result}; use anyhow::{format_err, Context, Result};
use async_trait::async_trait;
use custom::CustomAdapterConfig; use custom::CustomAdapterConfig;
use custom::BUILTIN_SPAWNING_ADAPTERS; use custom::BUILTIN_SPAWNING_ADAPTERS;
use log::*; use log::*;
@ -77,11 +78,17 @@ impl AdapterMeta {
pub trait GetMetadata { pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta; fn metadata(&self) -> &AdapterMeta;
} }
#[async_trait]
pub trait FileAdapter: GetMetadata + Send + Sync { pub trait FileAdapter: GetMetadata + Send + Sync {
/// adapt a file. /// adapt a file.
/// ///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
fn adapt(&self, a: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox>; async fn adapt(
&self,
a: AdaptInfo,
detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox>;
} }
pub struct AdaptInfo { pub struct AdaptInfo {

View File

@ -49,8 +49,9 @@ pub struct CustomAdapterConfig {
pub args: Vec<String>, pub args: Vec<String>,
/// The output path hint. The placeholders are the same as for `.args` /// The output path hint. The placeholders are the same as for `.args`
/// ///
/// If not set, defaults to ${input_virtual_path}.txt /// If not set, defaults to "${input_virtual_path}.txt"
/// ///
/// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter
pub output_path_hint: Option<String>, pub output_path_hint: Option<String>,
} }
@ -128,7 +129,6 @@ lazy_static! {
disabled_by_default: None, disabled_by_default: None,
match_only_by_mime: None, match_only_by_mime: None,
output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into()) output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into())
// postprocessors: [{name: "add_page_numbers_by_pagebreaks"}]
} }
]; ];
} }
@ -143,15 +143,13 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow:
} }
} }
fn proc_wait(mut child: Child) -> impl AsyncRead { fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead {
let s = stream! { let s = stream! {
let res = child.wait().await?; let res = child.wait().await?;
if res.success() { if res.success() {
yield std::io::Result::Ok(Bytes::new()); yield std::io::Result::Ok(Bytes::new());
} else { } else {
yield std::io::Result::Err(to_io_err( Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?;
format_err!("subprocess failed: {:?}", res),
));
} }
}; };
StreamReader::new(s) StreamReader::new(s)
@ -164,6 +162,7 @@ pub fn pipe_output(
exe_name: &str, exe_name: &str,
help: &str, help: &str,
) -> Result<ReadBox> { ) -> Result<ReadBox> {
let cmd_log = format!("{:?}", cmd); // todo: perf
let mut cmd = cmd let mut cmd = cmd
.stdin(Stdio::piped()) .stdin(Stdio::piped())
.stdout(Stdio::piped()) .stdout(Stdio::piped())
@ -177,10 +176,9 @@ pub fn pipe_output(
tokio::io::copy(&mut z, &mut stdi).await?; tokio::io::copy(&mut z, &mut stdi).await?;
std::io::Result::Ok(()) std::io::Result::Ok(())
}); });
Ok(Box::pin(stdo.chain(
Ok(Box::pin( proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)),
stdo.chain(proc_wait(cmd).chain(join_handle_to_stream(join))), )))
))
} }
pub struct CustomSpawningFileAdapter { pub struct CustomSpawningFileAdapter {
@ -224,8 +222,9 @@ impl CustomSpawningFileAdapter {
Ok(command) Ok(command)
} }
} }
#[async_trait]
impl FileAdapter for CustomSpawningFileAdapter { impl FileAdapter for CustomSpawningFileAdapter {
fn adapt<'a>( async fn adapt(
&self, &self,
ai: AdaptInfo, ai: AdaptInfo,
_detection_reason: &FileMatcher, _detection_reason: &FileMatcher,
@ -314,7 +313,7 @@ mod test {
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
// let r = adapter.adapt(a, &d)?; // let r = adapter.adapt(a, &d)?;
let r = loop_adapt(&adapter, d, a)?; let r = loop_adapt(&adapter, d, a).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,
@ -368,7 +367,7 @@ PREFIX:Page 1:
Path::new("foo.txt"), Path::new("foo.txt"),
Box::pin(Cursor::new(Vec::from(input))), Box::pin(Cursor::new(Vec::from(input))),
); );
let output = adapter.adapt(a, &d).unwrap(); let output = adapter.adapt(a, &d).await.unwrap();
let oup = adapted_to_vec(output).await?; let oup = adapted_to_vec(output).await?;
println!("output: {}", String::from_utf8_lossy(&oup)); println!("output: {}", String::from_utf8_lossy(&oup));

View File

@ -93,8 +93,13 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
filename.with_file_name(format!("{}{}", stem, new_extension)) filename.with_file_name(format!("{}{}", stem, new_extension))
} }
#[async_trait]
impl FileAdapter for DecompressAdapter { impl FileAdapter for DecompressAdapter {
fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
Ok(one_file(AdaptInfo { Ok(one_file(AdaptInfo {
filepath_hint: get_inner_filename(&ai.filepath_hint), filepath_hint: get_inner_filename(&ai.filepath_hint),
is_real_file: false, is_real_file: false,
@ -137,7 +142,7 @@ mod tests {
let filepath = test_data_dir().join("hello.gz"); let filepath = test_data_dir().join("hello.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = adapter.adapt(a, &d)?; let r = adapter.adapt(a, &d).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!(String::from_utf8(o)?, "hello\n"); assert_eq!(String::from_utf8(o)?, "hello\n");
Ok(()) Ok(())
@ -150,7 +155,7 @@ mod tests {
let filepath = test_data_dir().join("short.pdf.gz"); let filepath = test_data_dir().join("short.pdf.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = loop_adapt(&adapter, d, a)?; let r = loop_adapt(&adapter, d, a).await?;
let o = adapted_to_vec(r).await?; let o = adapted_to_vec(r).await?;
assert_eq!( assert_eq!(
String::from_utf8(o)?, String::from_utf8(o)?,

View File

@ -14,13 +14,15 @@ use writing::WritingFileAdapter;
// maybe todo: read list of extensions from // maybe todo: read list of extensions from
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
// but really, the probability of getting useful information from a .flv is low // but really, the probability of getting useful information from a .flv is low
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"]; static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"];
lazy_static! { lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
name: "ffmpeg".to_owned(), name: "ffmpeg".to_owned(),
version: 1, version: 1,
description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(), description:
"Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata"
.to_owned(),
recurses: false, recurses: false,
fast_matchers: EXTENSIONS fast_matchers: EXTENSIONS
.iter() .iter()
@ -52,7 +54,7 @@ struct FFprobeOutput {
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
struct FFprobeStream { struct FFprobeStream {
codec_type: String, // video,audio,subtitle index: i32, // stream index
} }
#[async_trait] #[async_trait]
@ -78,17 +80,17 @@ impl WritingFileAdapter for FFmpegAdapter {
} }
let inp_fname = filepath_hint; let inp_fname = filepath_hint;
let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed."); let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed.");
let has_subtitles = { let subtitle_streams = {
let probe = Command::new("ffprobe") let probe = Command::new("ffprobe")
.args(vec![ .args(vec![
"-v", "-v",
"error", "error", // show all errors
"-select_streams", "-select_streams",
"s", "s", // show only subtitle streams
"-of", "-of",
"json", "json", // use json as output format
"-show_entries", "-show_entries",
"stream=codec_type", "stream=index", // show index of subtitle streams
]) ])
.arg("-i") .arg("-i")
.arg(&inp_fname) .arg(&inp_fname)
@ -96,10 +98,14 @@ impl WritingFileAdapter for FFmpegAdapter {
.await .await
.map_err(spawn_fail)?; .map_err(spawn_fail)?;
if !probe.status.success() { if !probe.status.success() {
return Err(format_err!("ffprobe failed: {:?}", probe.status)); return Err(format_err!(
"ffprobe failed: {:?}\n{}",
probe.status,
String::from_utf8_lossy(&probe.stderr)
));
} }
let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?; let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?;
!p.streams.is_empty() p.streams
}; };
{ {
// extract file metadata (especially chapter names in a greppable format) // extract file metadata (especially chapter names in a greppable format)
@ -124,6 +130,7 @@ impl WritingFileAdapter for FFmpegAdapter {
.spawn()?; .spawn()?;
let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines(); let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines();
while let Some(line) = lines.next_line().await? { while let Some(line) = lines.next_line().await? {
let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines
async_writeln!(oup, "metadata: {line}")?; async_writeln!(oup, "metadata: {line}")?;
} }
let exit = probe.wait().await?; let exit = probe.wait().await?;
@ -131,7 +138,8 @@ impl WritingFileAdapter for FFmpegAdapter {
return Err(format_err!("ffprobe failed: {:?}", exit)); return Err(format_err!("ffprobe failed: {:?}", exit));
} }
} }
if has_subtitles { if !subtitle_streams.is_empty() {
for probe_stream in subtitle_streams.iter() {
// extract subtitles // extract subtitles
let mut cmd = Command::new("ffmpeg"); let mut cmd = Command::new("ffmpeg");
cmd.arg("-hide_banner") cmd.arg("-hide_banner")
@ -139,6 +147,8 @@ impl WritingFileAdapter for FFmpegAdapter {
.arg("panic") .arg("panic")
.arg("-i") .arg("-i")
.arg(&inp_fname) .arg(&inp_fname)
.arg("-map")
.arg(format!("0:{}", probe_stream.index)) // 0 for first input
.arg("-f") .arg("-f")
.arg("webvtt") .arg("webvtt")
.arg("-"); .arg("-");
@ -159,6 +169,7 @@ impl WritingFileAdapter for FFmpegAdapter {
} }
} }
} }
}
Ok(()) Ok(())
} }
} }

View File

@ -4,7 +4,11 @@
use anyhow::Result; use anyhow::Result;
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait;
use bytes::Bytes; use bytes::Bytes;
use encoding_rs::Encoding;
use encoding_rs_io::DecodeReaderBytesBuilder;
use tokio_util::io::SyncIoBridge;
use std::io::Cursor; use std::io::Cursor;
use std::path::PathBuf; use std::path::PathBuf;
@ -41,15 +45,16 @@ impl GetMetadata for PostprocPrefix {
&METADATA &METADATA
} }
} }
#[async_trait]
impl FileAdapter for PostprocPrefix { impl FileAdapter for PostprocPrefix {
fn adapt<'a>( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher, _detection_reason: &crate::matching::FileMatcher,
) -> Result<AdaptedFilesIterBox> { ) -> Result<AdaptedFilesIterBox> {
let read = add_newline(postproc_prefix( let read = add_newline(postproc_prefix(
&a.line_prefix, &a.line_prefix,
postproc_encoding(&a.line_prefix, a.inp)?, postproc_encoding(&a.line_prefix, a.inp).await?,
)); ));
// keep adapt info (filename etc) except replace inp // keep adapt info (filename etc) except replace inp
let ai = AdaptInfo { let ai = AdaptInfo {
@ -74,50 +79,53 @@ impl Read for ReadErr {
* Detects and converts encodings other than utf-8 to utf-8. * Detects and converts encodings other than utf-8 to utf-8.
* If the input stream does not contain valid text, returns the string `[rga: binary data]` instead * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
*/ */
pub fn postproc_encoding( async fn postproc_encoding(
_line_prefix: &str, _line_prefix: &str,
inp: impl AsyncRead + Send + 'static, inp: Pin<Box<dyn AsyncRead + Send>>,
) -> Result<Pin<Box<dyn AsyncRead + Send>>> { ) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
Ok(Box::pin(inp)) // check for binary content in first 8kB
// panic!("todo: implement"); // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
/*// TODO: parse these options from ripgrep's configuration let mut fourk = Vec::with_capacity(1 << 13);
let mut beginning = inp.take(1 << 13);
beginning.read_to_end(&mut fourk).await?;
let has_binary = fourk.contains(&0u8);
let enc = Encoding::for_bom(&fourk);
let inp = Cursor::new(fourk).chain(beginning.into_inner());
match enc {
Some((enc, _)) if enc != encoding_rs::UTF_8 => {
// detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
// TODO: parse these options from ripgrep's configuration
let encoding = None; // detect bom but usually assume utf8 let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true; let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new(); let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706 // https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present // this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?) // it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let inp = decode_builder let mut inp = decode_builder
.encoding(encoding) .encoding(encoding)
.utf8_passthru(true) .utf8_passthru(true)
.strip_bom(bom_sniffing) .strip_bom(bom_sniffing)
.bom_override(true) .bom_override(true)
.bom_sniffing(bom_sniffing) .bom_sniffing(bom_sniffing)
.build(inp); .build(SyncIoBridge::new(inp));
let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
// check for binary content in first 8kB let mut oup = Vec::new();
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file std::io::Read::read_to_end(&mut inp, &mut oup)?;
let mut fourk = Vec::with_capacity(1 << 13); Ok(oup)
let mut beginning = inp.take(1 << 13); })
.await??;
beginning.read_to_end(&mut fourk)?; Ok(Box::pin(Cursor::new(oup)))
}
if fourk.contains(&0u8) { _ => {
log::debug!("detected binary"); if has_binary {
let v = "[rga: binary data]"; log::debug!("detected binary");
return Ok(Box::new(std::io::Cursor::new(v))); return Ok(Box::pin(Cursor::new("[rga: binary data]")));
/*let err = std::io::Error::new( }
std::io::ErrorKind::InvalidData, Ok(Box::pin(inp))
format!("{}[rga: binary data]", line_prefix), }
);
return Err(err).context("");
return ReadErr {
err,
};*/
} }
Ok(Box::new(
std::io::Cursor::new(fourk).chain(beginning.into_inner()),
))*/
} }
/// Adds the given prefix to each line in an `AsyncRead`. /// Adds the given prefix to each line in an `AsyncRead`.
@ -164,13 +172,14 @@ impl GetMetadata for PostprocPageBreaks {
&METADATA &METADATA
} }
} }
#[async_trait]
impl FileAdapter for PostprocPageBreaks { impl FileAdapter for PostprocPageBreaks {
fn adapt<'a>( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher, _detection_reason: &crate::matching::FileMatcher,
) -> Result<AdaptedFilesIterBox> { ) -> Result<AdaptedFilesIterBox> {
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?); let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?);
// keep adapt info (filename etc) except replace inp // keep adapt info (filename etc) except replace inp
let ai = AdaptInfo { let ai = AdaptInfo {
inp: Box::pin(read), inp: Box::pin(read),
@ -282,7 +291,7 @@ mod tests {
let fname = test_data_dir().join("twoblankpages.pdf"); let fname = test_data_dir().join("twoblankpages.pdf");
let rd = File::open(&fname).await?; let rd = File::open(&fname).await?;
let (a, d) = simple_adapt_info(&fname, Box::pin(rd)); let (a, d) = simple_adapt_info(&fname, Box::pin(rd));
let res = loop_adapt(&adapter, d, a)?; let res = loop_adapt(&adapter, d, a).await?;
let buf = adapted_to_vec(res).await?; let buf = adapted_to_vec(res).await?;
@ -327,7 +336,8 @@ PREFIX:Page 3:
b: &str, b: &str,
) -> Result<()> { ) -> Result<()> {
let mut oup = Vec::new(); let mut oup = Vec::new();
let inp = postproc_encoding("", a)?; let inp = Box::pin(Cursor::new(a));
let inp = postproc_encoding("", inp).await?;
if pagebreaks { if pagebreaks {
postproc_pagebreaks(inp).read_to_end(&mut oup).await?; postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
} else { } else {
@ -341,6 +351,23 @@ PREFIX:Page 3:
Ok(()) Ok(())
} }
#[tokio::test]
async fn test_utf16() -> Result<()> {
let utf16lebom: &[u8] = &[
0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00,
0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8,
0xa9, 0xdc, 0x0a, 0x00,
];
let utf16bebom: &[u8] = &[
0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20,
0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d,
0xdc, 0xa9, 0x00, 0x0a,
];
test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?;
test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?;
Ok(())
}
#[tokio::test] #[tokio::test]
async fn post1() -> Result<()> { async fn post1() -> Result<()> {
let inp = "What is this\nThis is a test\nFoo"; let inp = "What is this\nThis is a test\nFoo";
@ -362,8 +389,7 @@ PREFIX:Page 3:
Ok(()) Ok(())
} }
/*
todo: uncomment when fixed
#[tokio::test] #[tokio::test]
async fn test_binary_content() -> Result<()> { async fn test_binary_content() -> Result<()> {
test_from_strs( test_from_strs(
@ -375,7 +401,7 @@ PREFIX:Page 3:
.await?; .await?;
test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?; test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
Ok(()) Ok(())
}*/ }
/*#[test] /*#[test]
fn chardet() -> Result<()> { fn chardet() -> Result<()> {

View File

@ -77,11 +77,13 @@ fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> {
return Ok(()); return Ok(());
} }
let inp_fname = filepath_hint; let inp_fname = filepath_hint;
let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)
let conn = Connection::open_with_flags(inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)?; .with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?;
let tables: Vec<String> = conn let tables: Vec<String> = conn
.prepare("select name from sqlite_master where type='table'")? .prepare("select name from sqlite_master where type='table'")
.query_map([], |r| r.get::<_, String>(0))? .context("while preparing query")?
.query_map([], |r| r.get::<_, String>(0))
.context("while executing query")?
.filter_map(|e| e.ok()) .filter_map(|e| e.ok())
.collect(); .collect();
debug!("db has {} tables", tables.len()); debug!("db has {} tables", tables.len());
@ -121,7 +123,9 @@ impl WritingFileAdapter for SqliteAdapter {
oup: Pin<Box<dyn AsyncWrite + Send>>, oup: Pin<Box<dyn AsyncWrite + Send>>,
) -> Result<()> { ) -> Result<()> {
let oup_sync = SyncIoBridge::new(oup); let oup_sync = SyncIoBridge::new(oup);
tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)).await??; tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync))
.await?
.context("in synchronous sqlite task")?;
Ok(()) Ok(())
} }
} }
@ -134,10 +138,10 @@ mod test {
#[tokio::test] #[tokio::test]
async fn simple() -> Result<()> { async fn simple() -> Result<()> {
let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default()); let adapter: Box<dyn FileAdapter> = Box::<SqliteAdapter>::default();
let fname = test_data_dir().join("hello.sqlite3"); let fname = test_data_dir().join("hello.sqlite3");
let (a, d) = simple_fs_adapt_info(&fname).await?; let (a, d) = simple_fs_adapt_info(&fname).await?;
let res = adapter.adapt(a, &d)?; let res = adapter.adapt(a, &d).await?;
let buf = adapted_to_vec(res).await?; let buf = adapted_to_vec(res).await?;

View File

@ -6,6 +6,7 @@ use crate::{
}; };
use anyhow::*; use anyhow::*;
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::*; use log::*;
use std::path::PathBuf; use std::path::PathBuf;
@ -45,8 +46,13 @@ impl GetMetadata for TarAdapter {
} }
} }
#[async_trait]
impl FileAdapter for TarAdapter { impl FileAdapter for TarAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
_detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
let AdaptInfo { let AdaptInfo {
filepath_hint, filepath_hint,
inp, inp,
@ -103,7 +109,7 @@ mod tests {
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let adapter = TarAdapter::new(); let adapter = TarAdapter::new();
let r = loop_adapt(&adapter, d, a).context("adapt")?; let r = loop_adapt(&adapter, d, a).await.context("adapt")?;
let o = adapted_to_vec(r).await.context("adapted_to_vec")?; let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
assert_eq!( assert_eq!(
String::from_utf8(o).context("parsing utf8")?, String::from_utf8(o).context("parsing utf8")?,

View File

@ -3,7 +3,7 @@ use std::pin::Pin;
use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err};
use super::{AdaptInfo, FileAdapter, GetMetadata}; use super::{AdaptInfo, FileAdapter, GetMetadata};
use anyhow::Result; use anyhow::{Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
use tokio::io::{AsyncReadExt, AsyncWrite}; use tokio::io::{AsyncReadExt, AsyncWrite};
@ -41,15 +41,17 @@ macro_rules! async_writeln {
} }
pub(crate) use async_writeln; pub(crate) use async_writeln;
#[async_trait]
impl<T> FileAdapter for T impl<T> FileAdapter for T
where where
T: WritingFileAdapter, T: WritingFileAdapter,
{ {
fn adapt( async fn adapt(
&self, &self,
a: super::AdaptInfo, a: super::AdaptInfo,
detection_reason: &crate::matching::FileMatcher, detection_reason: &crate::matching::FileMatcher,
) -> Result<crate::adapted_iter::AdaptedFilesIterBox> { ) -> Result<crate::adapted_iter::AdaptedFilesIterBox> {
let name = self.metadata().name.clone();
let (w, r) = tokio::io::duplex(128 * 1024); let (w, r) = tokio::io::duplex(128 * 1024);
let d2 = detection_reason.clone(); let d2 = detection_reason.clone();
let archive_recursion_depth = a.archive_recursion_depth + 1; let archive_recursion_depth = a.archive_recursion_depth + 1;
@ -59,7 +61,10 @@ where
let config = a.config.clone(); let config = a.config.clone();
let joiner = tokio::spawn(async move { let joiner = tokio::spawn(async move {
let x = d2; let x = d2;
T::adapt_write(a, &x, Box::pin(w)).await.map_err(to_io_err) T::adapt_write(a, &x, Box::pin(w))
.await
.with_context(|| format!("in {}.adapt_write", name))
.map_err(to_io_err)
}); });
Ok(one_file(AdaptInfo { Ok(one_file(AdaptInfo {

View File

@ -5,7 +5,7 @@ use async_stream::stream;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::*; use log::*;
static EXTENSIONS: &[&str] = &["zip"]; static EXTENSIONS: &[&str] = &["zip", "jar"];
lazy_static! { lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
@ -36,8 +36,13 @@ impl GetMetadata for ZipAdapter {
} }
} }
#[async_trait]
impl FileAdapter for ZipAdapter { impl FileAdapter for ZipAdapter {
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> { async fn adapt(
&self,
ai: AdaptInfo,
_detection_reason: &FileMatcher,
) -> Result<AdaptedFilesIterBox> {
// let (s, r) = mpsc::channel(1); // let (s, r) = mpsc::channel(1);
let AdaptInfo { let AdaptInfo {
inp, inp,
@ -52,11 +57,11 @@ impl FileAdapter for ZipAdapter {
if is_real_file { if is_real_file {
use async_zip::read::fs::ZipFileReader; use async_zip::read::fs::ZipFileReader;
let s = stream! {
let zip = ZipFileReader::new(&filepath_hint).await?; let zip = ZipFileReader::new(&filepath_hint).await?;
for i in 0..zip.entries().len() { let s = stream! {
let reader = zip.entry_reader(i).await?; for i in 0..zip.file().entries().len() {
let file = reader.entry(); let file = zip.get_entry(i)?;
let reader = zip.entry(i).await?;
if file.filename().ends_with('/') { if file.filename().ends_with('/') {
continue; continue;
} }
@ -98,10 +103,11 @@ impl FileAdapter for ZipAdapter {
let mut zip = ZipFileReader::new(inp); let mut zip = ZipFileReader::new(inp);
let s = stream! { let s = stream! {
while !zip.finished() { while let Some(mut entry) = zip.next_entry().await? {
if let Some(reader) = zip.entry_reader().await? { let file = entry.entry();
let file = reader.entry();
if file.filename().ends_with('/') { if file.filename().ends_with('/') {
zip = entry.skip().await?;
continue; continue;
} }
debug!( debug!(
@ -114,6 +120,7 @@ impl FileAdapter for ZipAdapter {
); );
let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); let new_line_prefix = format!("{}{}: ", line_prefix, file.filename());
let fname = PathBuf::from(file.filename()); let fname = PathBuf::from(file.filename());
let reader = entry.reader();
tokio::pin!(reader); tokio::pin!(reader);
// SAFETY: this should be solvable without unsafe but idk how :( // SAFETY: this should be solvable without unsafe but idk how :(
// the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream
@ -133,7 +140,8 @@ impl FileAdapter for ZipAdapter {
postprocess, postprocess,
config: config.clone(), config: config.clone(),
}); });
} zip = entry.done().await.context("going to next file in zip but entry was not read fully")?;
} }
}; };
@ -183,7 +191,6 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
mod test { mod test {
use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder}; use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder};
use super::*; use super::*;
use crate::{preproc::loop_adapt, test_utils::*}; use crate::{preproc::loop_adapt, test_utils::*};
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
@ -213,7 +220,7 @@ mod test {
async fn only_seek_zip_fs() -> Result<()> { async fn only_seek_zip_fs() -> Result<()> {
let zip = test_data_dir().join("only-seek-zip.zip"); let zip = test_data_dir().join("only-seek-zip.zip");
let (a, d) = simple_fs_adapt_info(&zip).await?; let (a, d) = simple_fs_adapt_info(&zip).await?;
let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?; let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?;
// assert_eq!(String::from_utf8(v)?, ""); // assert_eq!(String::from_utf8(v)?, "");
Ok(()) Ok(())
@ -236,7 +243,7 @@ mod test {
&PathBuf::from("outer.zip"), &PathBuf::from("outer.zip"),
Box::pin(std::io::Cursor::new(zipfile)), Box::pin(std::io::Cursor::new(zipfile)),
); );
let buf = adapted_to_vec(loop_adapt(&adapter, d, a)?).await?; let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?;
assert_eq!( assert_eq!(
String::from_utf8(buf)?, String::from_utf8(buf)?,

View File

@ -43,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
// happens if e.g. ripgrep detects binary data in the pipe so it cancels reading // happens if e.g. ripgrep detects binary data in the pipe so it cancels reading
debug!("output cancelled (broken pipe)"); debug!("output cancelled (broken pipe)");
} else { } else {
Err(e).context("copying adapter output to stdout {}")?; Err(e).context("copying adapter output to stdout")?;
} }
} }
debug!("running adapter took {} total", print_dur(start)); debug!("running adapter took {} total", print_dur(start));

View File

@ -1,17 +1,17 @@
use std::pin::Pin; use std::{future::Future, pin::Pin};
use anyhow::Result; use anyhow::{Context, Result};
use async_compression::tokio::write::ZstdEncoder; use async_compression::tokio::write::ZstdEncoder;
use async_stream::stream; use async_stream::stream;
use crate::to_io_err;
use log::*; use log::*;
use tokio::io::{AsyncRead, AsyncWriteExt}; use tokio::io::{AsyncRead, AsyncWriteExt};
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
use tokio_util::io::{ReaderStream, StreamReader}; use tokio_util::io::{ReaderStream, StreamReader};
use crate::to_io_err; type FinishHandler =
dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send;
type FinishHandler = dyn FnOnce((u64, Option<Vec<u8>>)) -> Result<()> + Send;
/** /**
* wrap a AsyncRead so that it is passthrough, * wrap a AsyncRead so that it is passthrough,
* but also the written data is compressed and written into a buffer, * but also the written data is compressed and written into a buffer,
@ -26,7 +26,7 @@ pub fn async_read_and_write_to_cache<'a>(
let inp = Box::pin(inp); let inp = Box::pin(inp);
let mut zstd_writer = Some(ZstdEncoder::with_quality( let mut zstd_writer = Some(ZstdEncoder::with_quality(
Vec::new(), Vec::new(),
async_compression::Level::Precise(compression_level as u32), async_compression::Level::Precise(compression_level),
)); ));
let mut bytes_written = 0; let mut bytes_written = 0;
@ -64,7 +64,7 @@ pub fn async_read_and_write_to_cache<'a>(
}; };
// EOF, finish! // EOF, finish!
on_finish(finish) on_finish(finish).await.context("write_to_cache on_finish")
.map_err(to_io_err)?; .map_err(to_io_err)?;
}; };

View File

@ -108,6 +108,7 @@ impl FromStr for CacheMaxBlobLen {
rename_all = "kebab-case", rename_all = "kebab-case",
about = env!("CARGO_PKG_DESCRIPTION"), about = env!("CARGO_PKG_DESCRIPTION"),
author = env!("CARGO_PKG_HOMEPAGE"), author = env!("CARGO_PKG_HOMEPAGE"),
long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.",
// TODO: long_about does not seem to work to only show this on short help // TODO: long_about does not seem to work to only show this on short help
after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]", after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]",
usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]" usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]"
@ -197,9 +198,9 @@ pub struct CacheConfig {
/// Disable caching of results /// Disable caching of results
/// ///
/// By default, rga caches the extracted text, if it is small enough, /// By default, rga caches the extracted text, if it is small enough,
/// to a database in ~/.cache/rga on Linux, /// to a database in ${XDG_CACHE_DIR-~/.cache}/ripgrep-all on Linux,
/// ~/Library/Caches/rga on macOS, /// ~/Library/Caches/ripgrep-all on macOS,
/// or C:\Users\username\AppData\Local\rga on Windows. /// or C:\Users\username\AppData\Local\ripgrep-all on Windows.
/// This way, repeated searches on the same set of files will be much faster. /// This way, repeated searches on the same set of files will be much faster.
/// If you pass this flag, all caching will be disabled. /// If you pass this flag, all caching will be disabled.
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
@ -208,7 +209,9 @@ pub struct CacheConfig {
/// Max compressed size to cache /// Max compressed size to cache
/// ///
/// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G /// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time.
///
/// Allowed suffixes on command line: k M G
#[serde(default, skip_serializing_if = "is_default")] #[serde(default, skip_serializing_if = "is_default")]
#[structopt( #[structopt(
default_value, default_value,

View File

@ -3,25 +3,28 @@ use crate::adapters::*;
use crate::caching_writer::async_read_and_write_to_cache; use crate::caching_writer::async_read_and_write_to_cache;
use crate::config::RgaConfig; use crate::config::RgaConfig;
use crate::matching::*; use crate::matching::*;
use crate::preproc_cache::CacheKey;
use crate::recurse::concat_read_streams; use crate::recurse::concat_read_streams;
use crate::{ use crate::{
preproc_cache::{LmdbCache, PreprocCache}, preproc_cache::{open_cache_db, PreprocCache},
print_bytes, print_bytes,
}; };
use anyhow::*; use anyhow::*;
use async_compression::tokio::bufread::ZstdDecoder; use async_compression::tokio::bufread::ZstdDecoder;
use async_stream::stream; use async_stream::stream;
// use futures::future::{BoxFuture, FutureExt};
use log::*; use log::*;
use path_clean::PathClean;
use postproc::PostprocPrefix; use postproc::PostprocPrefix;
use std::future::Future;
use std::io::Cursor; use std::io::Cursor;
use std::path::Path; use std::path::Path;
use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use tokio::io::AsyncBufRead; use tokio::io::AsyncBufRead;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use tokio::io::BufReader; use tokio::io::BufReader;
type ActiveAdapters = Vec<Arc<dyn FileAdapter>>; pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
async fn choose_adapter( async fn choose_adapter(
config: &RgaConfig, config: &RgaConfig,
@ -120,36 +123,6 @@ pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
.with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy())) .with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
} }
/// Builds the serialized cache key for running `adapter` on the file at `filepath_hint`.
///
/// The key always contains the cleaned file path and the file's modification time.
/// What identifies the adapter side depends on whether the adapter recurses:
///
/// * recursing adapter: the `(name, version)` pair of every entry in `active_adapters`;
/// * non-recursing adapter: only this adapter's own name and version.
///
/// # Errors
///
/// Returns an error if the file metadata cannot be read, if the platform does not
/// report a modification time, or if the key cannot be serialized.
fn compute_cache_key(
    filepath_hint: &Path,
    adapter: &dyn FileAdapter,
    active_adapters: ActiveAdapters,
) -> Result<Vec<u8>> {
    let clean_path = filepath_hint.to_owned().clean();
    let meta = std::fs::metadata(filepath_hint)
        .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
    // `modified()` returns Err on platforms without mtime support; report that to the
    // caller instead of panicking, since this function is fallible anyway.
    let modified = meta.modified().with_context(|| {
        format!(
            "reading modification time of {}",
            filepath_hint.to_string_lossy()
        )
    })?;
    let adapter_meta = adapter.metadata();
    if adapter_meta.recurses {
        let active_adapters_cache_key = active_adapters
            .iter()
            .map(|a| (a.metadata().name.clone(), a.metadata().version))
            .collect::<Vec<_>>();
        let key = (active_adapters_cache_key, clean_path, modified);
        debug!("Cache key (with recursion): {:?}", key);
        bincode::serialize(&key).context("could not serialize path")
    } else {
        let key = (
            adapter_meta.name.clone(),
            adapter_meta.version,
            clean_path,
            modified,
        );
        debug!("Cache key (no recursion): {:?}", key);
        bincode::serialize(&key).context("could not serialize path")
    }
}
async fn adapt_caching( async fn adapt_caching(
ai: AdaptInfo, ai: AdaptInfo,
adapter: Arc<dyn FileAdapter>, adapter: Arc<dyn FileAdapter>,
@ -166,41 +139,44 @@ async fn adapt_caching(
ai.filepath_hint.to_string_lossy(), ai.filepath_hint.to_string_lossy(),
&meta.name &meta.name
); );
let db_name = format!("{}.v{}", meta.name, meta.version);
let cache_compression_level = ai.config.cache.compression_level; let cache_compression_level = ai.config.cache.compression_level;
let cache_max_blob_len = ai.config.cache.max_blob_len; let cache_max_blob_len = ai.config.cache.max_blob_len;
let cache = if ai.is_real_file { let cache = if ai.is_real_file && !ai.config.cache.disabled {
LmdbCache::open(&ai.config.cache)? Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
} else { } else {
None None
}; };
let mut cache = cache.context("No cache?")?; let mut cache = cache.context("No cache?")?;
let cache_key: Vec<u8> = let cache_key = CacheKey::new(&ai.filepath_hint, adapter.as_ref(), &active_adapters)?;
compute_cache_key(&ai.filepath_hint, adapter.as_ref(), active_adapters)?;
// let dbg_ctx = format!("adapter {}", &adapter.metadata().name); // let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
let cached = cache.get(&db_name, &cache_key)?; let cached = cache.get(&cache_key).await.context("cache.get")?;
match cached { match cached {
Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))), Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
None => { None => {
debug!("cache MISS, running adapter with caching..."); debug!("cache MISS, running adapter with caching...");
let inp = loop_adapt(adapter.as_ref(), detection_reason, ai)?; let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
let inp = concat_read_streams(inp); let inp = concat_read_streams(inp);
let inp = async_read_and_write_to_cache( let inp = async_read_and_write_to_cache(
inp, inp,
cache_max_blob_len.0, cache_max_blob_len.0,
cache_compression_level.0, cache_compression_level.0,
Box::new(move |(uncompressed_size, compressed)| { Box::new(move |(uncompressed_size, compressed)| {
Box::pin(async move {
debug!( debug!(
"uncompressed output: {}", "uncompressed output: {}",
print_bytes(uncompressed_size as f64) print_bytes(uncompressed_size as f64)
); );
if let Some(cached) = compressed { if let Some(cached) = compressed {
debug!("compressed output: {}", print_bytes(cached.len() as f64)); debug!("compressed output: {}", print_bytes(cached.len() as f64));
cache.set(&db_name, &cache_key, &cached)? cache
.set(&cache_key, cached)
.await
.context("writing to cache")?
} }
Ok(()) Ok(())
})
}), }),
)?; )?;
@ -213,21 +189,34 @@ pub fn loop_adapt(
adapter: &dyn FileAdapter, adapter: &dyn FileAdapter,
detection_reason: FileMatcher, detection_reason: FileMatcher,
ai: AdaptInfo, ai: AdaptInfo,
) -> Pin<Box<dyn Future<Output = anyhow::Result<AdaptedFilesIterBox>> + Send + '_>> {
Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
}
pub async fn loop_adapt_inner(
adapter: &dyn FileAdapter,
detection_reason: FileMatcher,
ai: AdaptInfo,
) -> anyhow::Result<AdaptedFilesIterBox> { ) -> anyhow::Result<AdaptedFilesIterBox> {
let fph = ai.filepath_hint.clone(); let fph = ai.filepath_hint.clone();
let inp = adapter.adapt(ai, &detection_reason).with_context(|| { let inp = adapter.adapt(ai, &detection_reason).await;
let inp = if adapter.metadata().name == "postprocprefix" {
// don't add confusing error context
inp?
} else {
inp.with_context(|| {
format!( format!(
"adapting {} via {} failed", "adapting {} via {} failed",
fph.to_string_lossy(), fph.to_string_lossy(),
adapter.metadata().name adapter.metadata().name
) )
})?; })?
};
let s = stream! { let s = stream! {
for await file in inp { for await file in inp {
match buf_choose_adapter(file?).await? { match buf_choose_adapter(file?).await? {
Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => { Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 { if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes(); let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
yield Ok(AdaptInfo { yield Ok(AdaptInfo {
inp: Box::pin(Cursor::new(s)), inp: Box::pin(Cursor::new(s)),
..ai ..ai
@ -243,7 +232,7 @@ pub fn loop_adapt(
ai.filepath_hint.to_string_lossy(), ai.filepath_hint.to_string_lossy(),
&adapter.metadata().name &adapter.metadata().name
); );
for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai)? { for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
yield ifile; yield ifile;
} }
} }

View File

@ -1,135 +1,188 @@
use crate::{config::CacheConfig, print_bytes, print_dur}; use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
use anyhow::{format_err, Context, Result}; use anyhow::{Context, Result};
use log::*; use path_clean::PathClean;
use rkv::backend::{BackendEnvironmentBuilder, LmdbEnvironment}; use rusqlite::{named_params, OptionalExtension};
use std::{fmt::Display, path::Path, time::Instant}; use std::{path::Path, time::UNIX_EPOCH};
use tokio_rusqlite::Connection;
pub trait PreprocCache: Send + Sync { #[derive(Clone)]
/*/// gets cache at specified key. pub struct CacheKey {
/// if cache hit, return the resulting data adapter: String,
/// else, run the given lambda, and store its result in the cache if present adapter_version: i32,
fn get_or_run<'a>( active_adapters: String,
&mut self, file_path: String,
db_name: &str, file_mtime_unix_ms: i64,
key: &[u8],
debug_name: &str,
runner: Box<dyn FnOnce() -> Result<Option<Vec<u8>>> + 'a>,
) -> Result<Option<Vec<u8>>>;*/
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>>;
fn set(&mut self, db_name: &str, key: &[u8], value: &[u8]) -> Result<()>;
} }
impl CacheKey {
/// opens a LMDB cache pub fn new(
fn open_cache_db( filepath_hint: &Path,
path: &Path, adapter: &dyn FileAdapter,
) -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>> { active_adapters: &ActiveAdapters,
std::fs::create_dir_all(path)?; ) -> Result<CacheKey> {
// use rkv::backend::LmdbEnvironmentFlags; let meta = std::fs::metadata(filepath_hint)
.with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
rkv::Manager::<LmdbEnvironment>::singleton() let modified = meta.modified().expect("weird OS that can't into mtime");
.write() let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
.map_err(|_| format_err!("could not write cache db manager"))? let active_adapters = if adapter.metadata().recurses {
.get_or_create(path, |p| { serde_json::to_string(
let mut builder = rkv::Rkv::environment_builder::<rkv::backend::Lmdb>(); &active_adapters
builder .iter()
.set_flags(rkv::EnvironmentFlags::NO_SYNC) .map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
.set_flags(rkv::EnvironmentFlags::WRITE_MAP) // not durable cuz it's a cache .collect::<Vec<_>>(),
// i'm not sure why NO_TLS is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with )?
// LmdbError(ReadersFull). Those "open readers" stay even after the corresponding processes exit. } else {
// hope setting this doesn't break integrity "null".to_string()
.set_flags(rkv::EnvironmentFlags::NO_TLS) };
// sometimes, this seems to cause the data.mdb file to appear as 2GB in size (with holes), but sometimes not? Ok(CacheKey {
.set_map_size(2 * 1024 * 1024 * 1024) adapter: adapter.metadata().name.clone(),
.set_max_dbs(100) adapter_version: adapter.metadata().version,
.set_max_readers(128); file_path: filepath_hint.clean().to_string_lossy().to_string(),
rkv::Rkv::from_builder(p, builder) file_mtime_unix_ms,
active_adapters,
}) })
.map_err(|e| format_err!("could not get/create cache db: {}", e))
}
pub struct LmdbCache {
db_arc: std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>,
}
impl LmdbCache {
pub fn open(config: &CacheConfig) -> Result<Option<LmdbCache>> {
if config.disabled {
return Ok(None);
}
let path = Path::new(&config.path.0);
Ok(Some(LmdbCache {
db_arc: open_cache_db(path)?,
}))
} }
} }
#[derive(Debug)] #[async_trait::async_trait]
struct RkvErrWrap(rkv::StoreError); pub trait PreprocCache {
impl Display for RkvErrWrap { async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
self.0.fmt(f) }
/// Applies the per-connection pragmas, makes sure the `preproc_cache` table and its
/// unique index exist, and stamps the database via `create_pragmas` when its
/// `application_id` is not the expected one.
///
/// Pragma choices follow https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
/// (`page_size` is not set here; an earlier 32768 setting is disabled).
async fn connect_pragmas(db: &Connection) -> Result<()> {
    /// Value `create_pragmas` writes into `application_id`.
    const EXPECTED_APPLICATION_ID: i64 = 924716026;
    /// Connection setup plus idempotent schema creation, run as a single batch.
    const SETUP_SQL: &str = "
pragma journal_mode = WAL;
pragma foreign_keys = on;
pragma temp_store = memory;
pragma synchronous = off; -- integrity isn't very important here
pragma mmap_size = 30000000000;
create table if not exists preproc_cache (
adapter text not null,
adapter_version integer not null,
created_unix_ms integer not null default (unixepoch() * 1000),
active_adapters text not null, -- 'null' if adapter cannot recurse
file_path text not null,
file_mtime_unix_ms integer not null,
text_content_zstd blob not null
) strict;
create unique index if not exists preproc_cache_idx on preproc_cache (adapter, adapter_version, file_path, active_adapters);
";

    let setup_outcome = db.call(|conn| conn.execute_batch(SETUP_SQL)).await;
    setup_outcome.context("connect_pragmas")?;

    let application_id: i64 = db
        .call(|conn| conn.pragma_query_value(None, "application_id", |row| row.get(0)))
        .await?;
    if application_id == EXPECTED_APPLICATION_ID {
        return Ok(());
    }
    // A different id (probably) means the database file was just created.
    create_pragmas(db).await.context("create_pragmas")?;
    Ok(())
}
/// Stamps the database with this cache's `application_id` and its schema `user_version`.
async fn create_pragmas(db: &Connection) -> Result<()> {
    /// Identification pragmas, run as a single batch.
    const STAMP_SQL: &str = "
pragma application_id = 924716026;
pragma user_version = 2; -- todo: on upgrade clear db if version is unexpected
";

    let stamped = db.call(|conn| conn.execute_batch(STAMP_SQL)).await;
    stamped?;
    Ok(())
}
/// [`PreprocCache`] implementation that stores entries in a SQLite database.
struct SqliteCache {
    /// Async connection handle to the database that holds the `preproc_cache` table.
    db: Connection,
}
impl SqliteCache {
async fn new(path: &Path) -> Result<SqliteCache> {
let db = Connection::open(path.join("cache.sqlite3")).await?;
connect_pragmas(&db).await?;
Ok(SqliteCache { db })
} }
} }
impl std::error::Error for RkvErrWrap {}
impl PreprocCache for LmdbCache { #[async_trait::async_trait]
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>> { impl PreprocCache for SqliteCache {
let start = Instant::now(); async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
let db_env = self let key = (*key).clone(); // todo: without cloning
.db_arc Ok(self
.read() .db
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?; .call(move |db| {
let db = db_env db.query_row(
.open_single(db_name, rkv::store::Options::create()) "select text_content_zstd from preproc_cache where
.map_err(RkvErrWrap) adapter = :adapter
.context("could not open cache db store")?; and adapter_version = :adapter_version
and active_adapters = :active_adapters
let reader = db_env.read().expect("could not get reader"); and file_path = :file_path
let cached = db and file_mtime_unix_ms = :file_mtime_unix_ms
.get(&reader, key) ",
.map_err(RkvErrWrap) named_params! {
.context("could not read from db")?; ":adapter": &key.adapter,
":adapter_version": &key.adapter_version,
match cached { ":active_adapters": &key.active_adapters,
Some(rkv::Value::Blob(cached)) => { ":file_path": &key.file_path,
debug!( ":file_mtime_unix_ms": &key.file_mtime_unix_ms
"cache HIT, reading {} (compressed) from cache", },
print_bytes(cached.len() as f64) |r| r.get::<_, Vec<u8>>(0),
); )
debug!("reading from cache took {}", print_dur(start)); .optional()
Ok(Some(Vec::from(cached))) })
.await
.context("reading from cache")?)
} }
Some(_) => Err(format_err!("Integrity: value not blob"))?,
None => Ok(None), async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
let key = (*key).clone(); // todo: without cloning
Ok(self
.db
.call(move |db| {
db.execute(
"insert into preproc_cache (adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
(:adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
on conflict (adapter, adapter_version, active_adapters, file_path) do update set
file_mtime_unix_ms = :file_mtime_unix_ms,
created_unix_ms = unixepoch() * 1000,
text_content_zstd = :text_content_zstd",
named_params! {
":adapter": &key.adapter,
":adapter_version": &key.adapter_version,
":active_adapters": &key.active_adapters,
":file_path": &key.file_path,
":file_mtime_unix_ms": &key.file_mtime_unix_ms,
":text_content_zstd": value
}
).map(|_| ())
})
.await?)
} }
} }
fn set(&mut self, db_name: &str, key: &[u8], got: &[u8]) -> Result<()> { /// opens a default cache
let start = Instant::now(); pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
debug!("writing {} to cache", print_bytes(got.len() as f64)); std::fs::create_dir_all(path)?;
let db_env = self SqliteCache::new(path).await
.db_arc }
.read()
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
let db = db_env #[cfg(test)]
.open_single(db_name, rkv::store::Options::create()) mod test {
.map_err(RkvErrWrap)
.context("could not open cache db store")?;
let mut writer = db_env use crate::preproc_cache::*;
.write()
.map_err(RkvErrWrap)
.with_context(|| format_err!("could not open write handle to cache"))?;
db.put(&mut writer, key, &rkv::Value::Blob(got)) #[tokio::test]
.map_err(RkvErrWrap) async fn test_read_write() -> anyhow::Result<()> {
.with_context(|| format_err!("could not write to cache"))?; let path = tempfile::tempdir()?;
writer let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
.commit() // db.set();
.map_err(RkvErrWrap)
.with_context(|| "could not write cache".to_string())?;
debug!("writing to cache took {}", print_dur(start));
Ok(()) Ok(())
} }
} }