mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-08 14:00:37 +00:00
Merge remote-tracking branch 'upstream/master' into mbox-extractor
This commit is contained in:
commit
2259730c67
27
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
27
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: ''
|
||||
labels: bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
|
||||
|
||||
**To Reproduce**
|
||||
|
||||
Attach example file:
|
||||
|
||||
Run command:
|
||||
|
||||
**Output**
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots to help explain your problem.
|
||||
|
||||
**Operating System and Version**
|
||||
|
||||
|
||||
**Output of `rga --version`**
|
20
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
20
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Is your feature request related to a problem? Please describe.**
|
||||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
||||
|
||||
**Describe the solution you'd like**
|
||||
A clear and concise description of what you want to happen.
|
||||
|
||||
**Describe alternatives you've considered**
|
||||
A clear and concise description of any alternative solutions or features you've considered.
|
||||
|
||||
**Additional context**
|
||||
Add any other context or screenshots about the feature request here.
|
72
.github/workflows/ci.yml
vendored
72
.github/workflows/ci.yml
vendored
@ -1,75 +1,25 @@
|
||||
# Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
|
||||
#
|
||||
# While our "example" application has the platform-specific code,
|
||||
# for simplicity we are compiling and testing everything on the Ubuntu environment only.
|
||||
# For multi-OS testing see the `cross.yml` workflow.
|
||||
# While our "example" application has platform-specific code,
|
||||
# for simplicity we are compiling and testing everything in a nix-on-Linux environment only.
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
name: ci
|
||||
|
||||
jobs:
|
||||
check:
|
||||
name: Check
|
||||
nix-flake-check:
|
||||
name: nix flake check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout sources
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install stable toolchain
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Install nix
|
||||
uses: cachix/install-nix-action@v21
|
||||
|
||||
- name: Run cargo check
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: check
|
||||
- name: Ensure the build succeeds
|
||||
run: nix build
|
||||
|
||||
test:
|
||||
name: Test Suite
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout sources
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install stable toolchain
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Run cargo test
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: test
|
||||
|
||||
lints:
|
||||
name: Lints
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout sources
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install stable toolchain
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
components: rustfmt, clippy
|
||||
|
||||
- name: Run cargo fmt
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: fmt
|
||||
args: --all -- --check
|
||||
|
||||
- name: Run cargo clippy
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: clippy
|
||||
args: -- -D warnings
|
||||
- name: Run `nix flake check` to run formatters, linters, and tests
|
||||
run: nix flake check --print-build-logs
|
||||
|
210
.github/workflows/release.yml
vendored
210
.github/workflows/release.yml
vendored
@ -18,14 +18,14 @@ on:
|
||||
# branches:
|
||||
# - ag/release
|
||||
tags:
|
||||
- 'v[0-9]+.[0-9]+.[0-9]+'
|
||||
- "v[0-9]+.[0-9]+.[0-9]+*"
|
||||
jobs:
|
||||
create-release:
|
||||
name: create-release
|
||||
runs-on: ubuntu-latest
|
||||
# env:
|
||||
# Set to force version number, e.g., when no tag exists.
|
||||
# RG_VERSION: TEST-0.0.0
|
||||
# Set to force version number, e.g., when no tag exists.
|
||||
# RG_VERSION: TEST-0.0.0
|
||||
steps:
|
||||
- name: Create artifacts directory
|
||||
run: mkdir artifacts
|
||||
@ -62,7 +62,7 @@ jobs:
|
||||
|
||||
build-release:
|
||||
name: build-release
|
||||
needs: ['create-release']
|
||||
needs: ["create-release"]
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
# For some builds, we use cross to test on 32-bit and big-endian
|
||||
@ -78,124 +78,124 @@ jobs:
|
||||
matrix:
|
||||
build: [linux, linux-arm, macos, win-msvc]
|
||||
include:
|
||||
- build: linux
|
||||
os: ubuntu-18.04
|
||||
rust: nightly
|
||||
target: x86_64-unknown-linux-musl
|
||||
- build: linux-arm
|
||||
os: ubuntu-18.04
|
||||
rust: nightly
|
||||
target: arm-unknown-linux-gnueabihf
|
||||
- build: macos
|
||||
os: macos-latest
|
||||
rust: nightly
|
||||
target: x86_64-apple-darwin
|
||||
- build: win-msvc
|
||||
os: windows-2019
|
||||
rust: nightly
|
||||
target: x86_64-pc-windows-msvc
|
||||
- build: linux
|
||||
os: ubuntu-22.04
|
||||
rust: nightly
|
||||
target: x86_64-unknown-linux-musl
|
||||
- build: linux-arm
|
||||
os: ubuntu-22.04
|
||||
rust: nightly
|
||||
target: arm-unknown-linux-gnueabihf
|
||||
- build: macos
|
||||
os: macos-latest
|
||||
rust: nightly
|
||||
target: x86_64-apple-darwin
|
||||
- build: win-msvc
|
||||
os: windows-2019
|
||||
rust: nightly
|
||||
target: x86_64-pc-windows-msvc
|
||||
#- build: win-gnu
|
||||
# os: windows-2019
|
||||
# rust: nightly-x86_64-gnu
|
||||
# target: x86_64-pc-windows-gnu
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install packages (Ubuntu)
|
||||
if: matrix.os == 'ubuntu-18.04'
|
||||
run: |
|
||||
ci/ubuntu-install-packages
|
||||
- name: Install packages (Ubuntu)
|
||||
if: matrix.os == 'ubuntu-22.04'
|
||||
run: |
|
||||
ci/ubuntu-install-packages
|
||||
|
||||
- name: Install packages (macOS)
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
ci/macos-install-packages
|
||||
- name: Install packages (macOS)
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
ci/macos-install-packages
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: ${{ matrix.rust }}
|
||||
profile: minimal
|
||||
override: true
|
||||
target: ${{ matrix.target }}
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: ${{ matrix.rust }}
|
||||
profile: minimal
|
||||
override: true
|
||||
target: ${{ matrix.target }}
|
||||
|
||||
- name: Use Cross
|
||||
# if: matrix.os != 'windows-2019'
|
||||
run: |
|
||||
cargo install cross
|
||||
echo "CARGO=cross" >> $GITHUB_ENV
|
||||
echo "TARGET_FLAGS=--target ${{ matrix.target }}" >> $GITHUB_ENV
|
||||
echo "TARGET_DIR=./target/${{ matrix.target }}" >> $GITHUB_ENV
|
||||
- name: Use Cross
|
||||
shell: bash
|
||||
run: |
|
||||
cargo install cross
|
||||
echo "CARGO=cross" >> $GITHUB_ENV
|
||||
echo "TARGET_FLAGS=--target ${{ matrix.target }}" >> $GITHUB_ENV
|
||||
echo "TARGET_DIR=./target/${{ matrix.target }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Show command used for Cargo
|
||||
run: |
|
||||
echo "cargo command is: ${{ env.CARGO }}"
|
||||
echo "target flag is: ${{ env.TARGET_FLAGS }}"
|
||||
echo "target dir is: ${{ env.TARGET_DIR }}"
|
||||
- name: Show command used for Cargo
|
||||
run: |
|
||||
echo "cargo command is: ${{ env.CARGO }}"
|
||||
echo "target flag is: ${{ env.TARGET_FLAGS }}"
|
||||
echo "target dir is: ${{ env.TARGET_DIR }}"
|
||||
|
||||
- name: Get release download URL
|
||||
uses: actions/download-artifact@v1
|
||||
with:
|
||||
name: artifacts
|
||||
path: artifacts
|
||||
- name: Get release download URL
|
||||
uses: actions/download-artifact@v1
|
||||
with:
|
||||
name: artifacts
|
||||
path: artifacts
|
||||
|
||||
- name: Set release upload URL and release version
|
||||
shell: bash
|
||||
run: |
|
||||
echo "RELEASE_UPLOAD_URL=$(cat artifacts/release-upload-url)" >> $GITHUB_ENV
|
||||
echo "release upload url: $RELEASE_UPLOAD_URL"
|
||||
echo "RELEASE_VERSION=$(cat artifacts/release-version)" >> $GITHUB_ENV
|
||||
echo "release version: $RELEASE_VERSION"
|
||||
- name: Set release upload URL and release version
|
||||
shell: bash
|
||||
run: |
|
||||
echo "RELEASE_UPLOAD_URL=$(cat artifacts/release-upload-url)" >> $GITHUB_ENV
|
||||
echo "release upload url: $RELEASE_UPLOAD_URL"
|
||||
echo "RELEASE_VERSION=$(cat artifacts/release-version)" >> $GITHUB_ENV
|
||||
echo "release version: $RELEASE_VERSION"
|
||||
|
||||
- name: Build release binary
|
||||
run: ${{ env.CARGO }} build --verbose --release ${{ env.TARGET_FLAGS }}
|
||||
- name: Build release binary
|
||||
run: ${{ env.CARGO }} build --verbose --release ${{ env.TARGET_FLAGS }}
|
||||
|
||||
- name: Strip release binary (linux and macos)
|
||||
if: matrix.build == 'linux' || matrix.build == 'macos'
|
||||
run: |
|
||||
strip "target/${{ matrix.target }}/release/rga" "target/${{ matrix.target }}/release/rga-preproc"
|
||||
- name: Strip release binary (linux and macos)
|
||||
if: matrix.build == 'linux' || matrix.build == 'macos'
|
||||
run: |
|
||||
strip "target/${{ matrix.target }}/release/rga" "target/${{ matrix.target }}/release/rga-preproc"
|
||||
|
||||
- name: Strip release binary (arm)
|
||||
if: matrix.build == 'linux-arm'
|
||||
run: |
|
||||
docker run --rm -v \
|
||||
"$PWD/target:/target:Z" \
|
||||
rustembedded/cross:arm-unknown-linux-gnueabihf \
|
||||
arm-linux-gnueabihf-strip \
|
||||
/target/arm-unknown-linux-gnueabihf/release/rga \
|
||||
/target/arm-unknown-linux-gnueabihf/release/rga-preproc
|
||||
- name: Strip release binary (arm)
|
||||
if: matrix.build == 'linux-arm'
|
||||
run: |
|
||||
docker run --rm -v \
|
||||
"$PWD/target:/target:Z" \
|
||||
rustembedded/cross:arm-unknown-linux-gnueabihf \
|
||||
arm-linux-gnueabihf-strip \
|
||||
/target/arm-unknown-linux-gnueabihf/release/rga \
|
||||
/target/arm-unknown-linux-gnueabihf/release/rga-preproc
|
||||
|
||||
- name: Build archive
|
||||
shell: bash
|
||||
run: |
|
||||
staging="ripgrep_all-${{ env.RELEASE_VERSION }}-${{ matrix.target }}"
|
||||
mkdir -p "$staging"/doc
|
||||
- name: Build archive
|
||||
shell: bash
|
||||
run: |
|
||||
staging="ripgrep_all-${{ env.RELEASE_VERSION }}-${{ matrix.target }}"
|
||||
mkdir -p "$staging"/doc
|
||||
|
||||
cp {README.md,LICENSE.md} "$staging/"
|
||||
cp CHANGELOG.md "$staging/doc/"
|
||||
cp {README.md,LICENSE.md} "$staging/"
|
||||
cp CHANGELOG.md "$staging/doc/"
|
||||
|
||||
if [ "${{ matrix.os }}" = "windows-2019" ]; then
|
||||
cp "target/${{ matrix.target }}/release/rga.exe" "$staging/"
|
||||
cp "target/${{ matrix.target }}/release/rga-preproc.exe" "$staging/"
|
||||
7z a "$staging.zip" "$staging"
|
||||
echo "ASSET=$staging.zip" >> $GITHUB_ENV
|
||||
else
|
||||
cp "target/${{ matrix.target }}/release/rga" "$staging/"
|
||||
cp "target/${{ matrix.target }}/release/rga-preproc" "$staging/"
|
||||
tar czf "$staging.tar.gz" "$staging"
|
||||
echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
|
||||
fi
|
||||
if [ "${{ matrix.os }}" = "windows-2019" ]; then
|
||||
cp "target/${{ matrix.target }}/release/rga.exe" "$staging/"
|
||||
cp "target/${{ matrix.target }}/release/rga-preproc.exe" "$staging/"
|
||||
7z a "$staging.zip" "$staging"
|
||||
echo "ASSET=$staging.zip" >> $GITHUB_ENV
|
||||
else
|
||||
cp "target/${{ matrix.target }}/release/rga" "$staging/"
|
||||
cp "target/${{ matrix.target }}/release/rga-preproc" "$staging/"
|
||||
tar czf "$staging.tar.gz" "$staging"
|
||||
echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Upload release archive
|
||||
uses: actions/upload-release-asset@v1.0.1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
upload_url: ${{ env.RELEASE_UPLOAD_URL }}
|
||||
asset_path: ${{ env.ASSET }}
|
||||
asset_name: ${{ env.ASSET }}
|
||||
asset_content_type: application/octet-stream
|
||||
- name: Upload release archive
|
||||
uses: actions/upload-release-asset@v1.0.1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
upload_url: ${{ env.RELEASE_UPLOAD_URL }}
|
||||
asset_path: ${{ env.ASSET }}
|
||||
asset_name: ${{ env.ASSET }}
|
||||
asset_content_type: application/octet-stream
|
||||
|
1155
Cargo.lock
generated
1155
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
71
Cargo.toml
71
Cargo.toml
@ -2,7 +2,7 @@
|
||||
[package]
|
||||
authors = ["phiresky <phireskyde+git@gmail.com>"]
|
||||
description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
exclude = [
|
||||
"exampledir/*",
|
||||
]
|
||||
@ -11,56 +11,57 @@ license = "AGPL-3.0-or-later"
|
||||
name = "ripgrep_all"
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/phiresky/ripgrep-all"
|
||||
version = "1.0.0-alpha.2"
|
||||
version = "1.0.0-alpha.5"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.32"
|
||||
async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]}
|
||||
async-stream = "0.3.3"
|
||||
async-trait = "0.1.64"
|
||||
async_zip = "0.0.9"
|
||||
bincode = "1.3.1"
|
||||
bytes = "1.2.1"
|
||||
clap = {version = "4.0.18", features = ["wrap_help"]}
|
||||
crossbeam = "0.8.1"
|
||||
crossbeam-channel = "0.5.1"
|
||||
derive_more = "0.99.9"
|
||||
anyhow = {version = "1.0.71", features = ["backtrace"]}
|
||||
async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] }
|
||||
async-stream = "0.3.5"
|
||||
async-trait = "0.1.68"
|
||||
async_zip = {version = "0.0.12", features = ["full"]}
|
||||
bincode = "1.3.3"
|
||||
bytes = "1.4.0"
|
||||
clap = {version = "4.3.0", features = ["wrap_help"]}
|
||||
crossbeam = "0.8.2"
|
||||
crossbeam-channel = "0.5.8"
|
||||
derive_more = "0.99.17"
|
||||
directories-next = "2.0.0"
|
||||
dyn-clonable = "0.9.0"
|
||||
dyn-clone = "1.0.2"
|
||||
encoding_rs = "0.8.24"
|
||||
dyn-clone = "1.0.11"
|
||||
encoding_rs = "0.8.32"
|
||||
encoding_rs_io = "0.1.7"
|
||||
env_logger = "0.9.0"
|
||||
glob = "0.3.0"
|
||||
env_logger = "0.10.0"
|
||||
glob = "0.3.1"
|
||||
json_comments = "0.2.1"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.11"
|
||||
log = "0.4.17"
|
||||
mailbox = "0.2.0"
|
||||
mailparse = "0.14.0"
|
||||
memchr = "2.3.3"
|
||||
memchr = "2.5.0"
|
||||
mime2ext = "0.1.52"
|
||||
paste = "1.0.0"
|
||||
path-clean = "0.1.0"
|
||||
paste = "1.0.12"
|
||||
path-clean = "1.0.1"
|
||||
pretty-bytes = "0.2.2"
|
||||
regex = "1.3.9"
|
||||
rkv = "0.17"
|
||||
rusqlite = {version = "0.28.0", features = ["vtab", "bundled"]}
|
||||
schemars = {version = "0.8.0-alpha-4", features = ["preserve_order"]}
|
||||
serde = {version = "1.0.115", features = ["derive"]}
|
||||
serde_json = "1.0.57"
|
||||
regex = "1.8.2"
|
||||
rusqlite = {version = "0.29.0", features = ["vtab", "bundled"]}
|
||||
schemars = {version = "0.8.12", features = ["preserve_order"]}
|
||||
serde = {version = "1.0.163", features = ["derive"]}
|
||||
serde_json = "1.0.96"
|
||||
size_format = "1.0.2"
|
||||
structopt = "0.3.17"
|
||||
tempfile = "3.1.0"
|
||||
tokio = {version = "1.21.2", features = ["full"]}
|
||||
tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]}
|
||||
structopt = "0.3.26"
|
||||
tempfile = "3.5.0"
|
||||
tokio = {version = "1.28.1", features = ["full"]}
|
||||
tokio-rusqlite = "0.4.0"
|
||||
tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]}
|
||||
tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" }
|
||||
tokio-util = {version = "0.7.4", features = ["io", "full"]}
|
||||
tree_magic = {package = "tree_magic_mini", version = "3.0.0"}
|
||||
tokio-util = {version = "0.7.8", features = ["io", "full"]}
|
||||
tree_magic = {package = "tree_magic_mini", version = "3.0.3"}
|
||||
|
||||
[dev-dependencies]
|
||||
async-recursion = "1.0.0"
|
||||
ctor = "0.1.20"
|
||||
async-recursion = "1.0.4"
|
||||
ctor = "0.2.0"
|
||||
pretty_assertions = "1.3.0"
|
||||
tempfile = "3.5.0"
|
||||
tokio-test = "0.4.2"
|
||||
|
161
README.md
161
README.md
@ -33,45 +33,7 @@ demo/
|
||||
|
||||
![rga-fzf](doc/rga-fzf.gif)
|
||||
|
||||
You can use rga interactively via fzf. Add the following to your ~/.{bash,zsh}rc:
|
||||
|
||||
```bash
|
||||
rga-fzf() {
|
||||
RG_PREFIX="rga --files-with-matches"
|
||||
local file
|
||||
file="$(
|
||||
FZF_DEFAULT_COMMAND="$RG_PREFIX '$1'" \
|
||||
fzf --sort --preview="[[ ! -z {} ]] && rga --pretty --context 5 {q} {}" \
|
||||
--phony -q "$1" \
|
||||
--bind "change:reload:$RG_PREFIX {q}" \
|
||||
--preview-window="70%:wrap"
|
||||
)" &&
|
||||
echo "opening $file" &&
|
||||
xdg-open "$file"
|
||||
}
|
||||
```
|
||||
|
||||
And for your `~/.config/fish/config.fish`:
|
||||
```
|
||||
function rga-fzf
|
||||
set RG_PREFIX 'rga --files-with-matches'
|
||||
if test (count $argv) -gt 1
|
||||
set RG_PREFIX "$RG_PREFIX $argv[1..-2]"
|
||||
end
|
||||
set -l file $file
|
||||
set file (
|
||||
FZF_DEFAULT_COMMAND="$RG_PREFIX '$argv[-1]'" \
|
||||
fzf --sort \
|
||||
--preview='test ! -z {} && \
|
||||
rga --pretty --context 5 {q} {}' \
|
||||
--phony -q "$argv[-1]" \
|
||||
--bind "change:reload:$RG_PREFIX {q}" \
|
||||
--preview-window='50%:wrap'
|
||||
) && \
|
||||
echo "opening $file" && \
|
||||
open "$file"
|
||||
end
|
||||
```
|
||||
See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.
|
||||
|
||||
## INSTALLATION
|
||||
|
||||
@ -86,9 +48,11 @@ Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestr
|
||||
`pacman -S ripgrep-all`.
|
||||
|
||||
#### Nix
|
||||
|
||||
`nix-env -iA nixpkgs.ripgrep-all`
|
||||
|
||||
#### Debian-based
|
||||
|
||||
Download the [rga binary][latestrelease] and get the dependencies like this:
|
||||
|
||||
`apt install ripgrep pandoc poppler-utils ffmpeg`
|
||||
@ -117,7 +81,7 @@ If you get an error like `VCRUNTIME140.DLL could not be found`, you need to inst
|
||||
|
||||
To install the dependencies, none of which are strictly necessary but all of which are very useful:
|
||||
|
||||
`brew install pandoc poppler tesseract ffmpeg`
|
||||
`brew install pandoc poppler ffmpeg`
|
||||
|
||||
### Compile from source
|
||||
|
||||
@ -131,58 +95,58 @@ rga should compile with stable Rust (v1.36.0+, check with `rustc --version`). To
|
||||
|
||||
## Available Adapters
|
||||
|
||||
rga works with _adapters_ that adapt various file formats. It comes with a few adapters integrated:
|
||||
|
||||
```
|
||||
rga --rga-list-adapters
|
||||
```
|
||||
|
||||
You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
|
||||
|
||||
<!-- this part generated by update-readme.sh -->
|
||||
|
||||
Adapters:
|
||||
|
||||
- **ffmpeg**
|
||||
Uses ffmpeg to extract video metadata/chapters and subtitles.
|
||||
Extensions: `.mkv`, `.mp4`, `.avi`
|
||||
- **pandoc**
|
||||
Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
|
||||
Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx
|
||||
Extensions: .epub, .odt, .docx, .fb2, .ipynb
|
||||
|
||||
* **pandoc**
|
||||
Uses pandoc to convert binary/unreadable text documents to plain markdown-like text.
|
||||
Extensions: `.epub`, `.odt`, `.docx`, `.fb2`, `.ipynb`
|
||||
- **poppler**
|
||||
Uses pdftotext (from poppler-utils) to extract plain text from PDF files
|
||||
Runs: pdftotext - -
|
||||
Extensions: .pdf
|
||||
Mime Types: application/pdf
|
||||
|
||||
- **poppler**
|
||||
Uses pdftotext (from poppler-utils) to extract plain text from PDF files.
|
||||
Extensions: `.pdf`
|
||||
Mime Types: `application/pdf`
|
||||
- **postprocpagebreaks**
|
||||
Adds the page number to each line for an input file that specifies page breaks as ascii page break character.
|
||||
Mainly to be used internally by the poppler adapter.
|
||||
Extensions: .asciipagebreaks
|
||||
|
||||
- **zip**
|
||||
Reads a zip file as a stream and recurses down into its contents.
|
||||
Extensions: `.zip`
|
||||
Mime Types: `application/zip`
|
||||
- **ffmpeg**
|
||||
Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata
|
||||
Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm
|
||||
|
||||
- **decompress**
|
||||
Reads compressed file as a stream and runs a different extractor on the contents.
|
||||
Extensions: `.tgz`, `.tbz`, `.tbz2`, `.gz`, `.bz2`, `.xz`, `.zst`
|
||||
Mime Types: `application/gzip`, `application/x-bzip`, `application/x-xz`, `application/zstd`
|
||||
- **zip**
|
||||
Reads a zip file as a stream and recurses down into its contents
|
||||
Extensions: .zip, .jar
|
||||
Mime Types: application/zip
|
||||
|
||||
- **tar**
|
||||
Reads a tar file as a stream and recurses down into its contents.
|
||||
Extensions: `.tar`
|
||||
- **decompress**
|
||||
Reads compressed file as a stream and runs a different extractor on the contents.
|
||||
Extensions: .tgz, .tbz, .tbz2, .gz, .bz2, .xz, .zst
|
||||
Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd
|
||||
|
||||
* **sqlite**
|
||||
Uses sqlite bindings to convert sqlite databases into a simple plain text format.
|
||||
Extensions: `.db`, `.db3`, `.sqlite`, `.sqlite3`
|
||||
Mime Types: `application/x-sqlite3`
|
||||
- **tar**
|
||||
Reads a tar file as a stream and recurses down into its contents
|
||||
Extensions: .tar
|
||||
|
||||
The following adapters are disabled by default, and can be enabled using `--rga-adapters=+pdfpages,tesseract`:
|
||||
- **sqlite**
|
||||
Uses sqlite bindings to convert sqlite databases into a simple plain text format
|
||||
Extensions: .db, .db3, .sqlite, .sqlite3
|
||||
Mime Types: application/x-sqlite3
|
||||
|
||||
- **pdfpages**
|
||||
Converts a pdf to its individual pages as png files. Only useful in combination with tesseract.
|
||||
Extensions: `.pdf`
|
||||
Mime Types: `application/pdf`
|
||||
|
||||
- **tesseract**
|
||||
Uses tesseract to run OCR on images to make them searchable.
|
||||
May need `-j1` to prevent overloading the system.
|
||||
Make sure you have tesseract installed.
|
||||
Extensions: `.jpg`, `.png`
|
||||
The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':
|
||||
|
||||
## USAGE:
|
||||
|
||||
@ -202,6 +166,17 @@ The following adapters are disabled by default, and can be enabled using `--rga-
|
||||
> Detection is only done on the first 8KiB of the file, since we can\'t
|
||||
> always seek on the input (in archives).
|
||||
|
||||
**\--rga-no-cache**
|
||||
|
||||
> Disable caching of results
|
||||
>
|
||||
> By default, rga caches the extracted text, if it is small enough, to a
|
||||
> database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
|
||||
> _\~/Library/Caches/ripgrep-all_ on macOS, or
|
||||
> C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
|
||||
> repeated searches on the same set of files will be much faster. If you
|
||||
> pass this flag, all caching will be disabled.
|
||||
|
||||
**-h**, **\--help**
|
||||
|
||||
> Prints help information
|
||||
@ -210,15 +185,9 @@ The following adapters are disabled by default, and can be enabled using `--rga-
|
||||
|
||||
> List all known adapters
|
||||
|
||||
**\--rga-no-cache**
|
||||
**\--rga-print-config-schema**
|
||||
|
||||
> Disable caching of results
|
||||
>
|
||||
> By default, rga caches the extracted text, if it is small enough, to a
|
||||
> database in \~/.cache/rga on Linux, _\~/Library/Caches/rga_ on macOS,
|
||||
> or C:\\Users\\username\\AppData\\Local\\rga on Windows. This way,
|
||||
> repeated searches on the same set of files will be much faster. If you
|
||||
> pass this flag, all caching will be disabled.
|
||||
> Print the JSON Schema of the configuration file
|
||||
|
||||
**\--rg-help**
|
||||
|
||||
@ -242,25 +211,32 @@ The following adapters are disabled by default, and can be enabled using `--rga-
|
||||
> use all default adapters except for bar and baz. \"+bar,baz\" means
|
||||
> use all default adapters and also bar and baz.
|
||||
|
||||
**\--rga-cache-compression-level=**\<cache-compression-level\>
|
||||
**\--rga-cache-compression-level=**\<compression-level\>
|
||||
|
||||
> ZSTD compression level to apply to adapter outputs before storing in
|
||||
> cache db
|
||||
>
|
||||
> Ranges from 1 - 22 \[default: 12\]
|
||||
|
||||
**\--rga-cache-max-blob-len=**\<cache-max-blob-len\>
|
||||
|
||||
> Max compressed size to cache
|
||||
>
|
||||
> Longest byte length (after compression) to store in cache. Longer
|
||||
> adapter outputs will not be cached and recomputed every time. Allowed
|
||||
> suffixes: k M G \[default: 2000000\]
|
||||
**\--rga-config-file=**\<config-file-path\>
|
||||
|
||||
**\--rga-max-archive-recursion=**\<max-archive-recursion\>
|
||||
|
||||
> Maximum nestedness of archives to recurse into \[default: 4\]
|
||||
|
||||
**\--rga-cache-max-blob-len=**\<max-blob-len\>
|
||||
|
||||
> Max compressed size to cache
|
||||
>
|
||||
> Longest byte length (after compression) to store in cache. Longer
|
||||
> adapter outputs will not be cached and recomputed every time.
|
||||
>
|
||||
> Allowed suffixes on command line: k M G \[default: 2000000\]
|
||||
|
||||
**\--rga-cache-path=**\<path\>
|
||||
|
||||
> Path to store cache db \[default: /home/phire/.cache/ripgrep-all\]
|
||||
|
||||
**-h** shows a concise overview, **\--help** shows more detail and
|
||||
advanced options.
|
||||
|
||||
@ -287,6 +263,7 @@ to debug the adapters.
|
||||
|
||||
You can use the provided [`flake.nix`](./flake.nix) to set up all build- and
|
||||
run-time dependencies:
|
||||
|
||||
1. Enable [Flakes](https://nixos.wiki/wiki/Flakes) in your Nix configuration.
|
||||
1. Add [`direnv`](https://direnv.net/) to your profile:
|
||||
`nix profile install nixpkgs#direnv`
|
||||
|
@ -7,7 +7,7 @@
|
||||
// https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc
|
||||
|
||||
// The config options are the same as the command line options,
|
||||
// but with --rga- prefix removed and - replaced with _.
|
||||
// but with --rga- prefix removed and - and . replaced with _.
|
||||
// e.g. --rga-no-cache becomes `"no_cache": true`.
|
||||
// The only exception is the `custom_adapters` option, which can only be set in this file.
|
||||
|
||||
|
@ -5,7 +5,7 @@ content=$(
|
||||
<!-- this part generated by update-readme.sh -->
|
||||
$(cargo run --bin rga -- --rga-list-adapters)
|
||||
|
||||
$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --atx-headers | rg --multiline "## USAGE:(.|\n)*")
|
||||
$(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
|
||||
<!-- end of part generated by update-readme.sh -->
|
||||
END
|
||||
)
|
||||
|
BIN
exampledir/exif.png
Normal file
BIN
exampledir/exif.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.9 MiB |
142
flake.lock
142
flake.lock
@ -3,11 +3,11 @@
|
||||
"advisory-db": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1670452192,
|
||||
"narHash": "sha256-f8NIFbqSbCzpssgDUK4srfgKaVaMhDScEptw4uuxGAc=",
|
||||
"lastModified": 1685821301,
|
||||
"narHash": "sha256-4XRcnSboLJw1XKjDpg2jBU70jEw/8Bgx4nUmnq3kXbY=",
|
||||
"owner": "rustsec",
|
||||
"repo": "advisory-db",
|
||||
"rev": "0a2faeb87195392b23333a8097309d29f2c5d31d",
|
||||
"rev": "af3f3d503f82056785841bee49997bae65eba1c0",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -26,11 +26,11 @@
|
||||
"rust-overlay": "rust-overlay"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1670546681,
|
||||
"narHash": "sha256-S33bhME0zPHPEZyZPCsrdQL/4WW/A020PwN+a3z7Q+I=",
|
||||
"lastModified": 1684981077,
|
||||
"narHash": "sha256-68X9cFm0RTZm8u0rXPbeBzOVUH5OoUGAfeHHVoxGd9o=",
|
||||
"owner": "ipetkov",
|
||||
"repo": "crane",
|
||||
"rev": "63f80ee278897e72a1468090278716b5befa5128",
|
||||
"rev": "35110cccf28823320f4fd697fcafcb5038683982",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -42,11 +42,11 @@
|
||||
"flake-compat": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1668681692,
|
||||
"narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
|
||||
"lastModified": 1673956053,
|
||||
"narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"rev": "009399224d5e398d03b22badca40a37ac85412a1",
|
||||
"rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -58,11 +58,11 @@
|
||||
"flake-compat_2": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1668681692,
|
||||
"narHash": "sha256-Ht91NGdewz8IQLtWZ9LCeNXMSXHUss+9COoqu6JLmXU=",
|
||||
"lastModified": 1673956053,
|
||||
"narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
|
||||
"owner": "edolstra",
|
||||
"repo": "flake-compat",
|
||||
"rev": "009399224d5e398d03b22badca40a37ac85412a1",
|
||||
"rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -72,12 +72,15 @@
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1667395993,
|
||||
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
|
||||
"lastModified": 1681202837,
|
||||
"narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
|
||||
"rev": "cfacdce06f30d2b68473a46042957675eebb3401",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -87,27 +90,15 @@
|
||||
}
|
||||
},
|
||||
"flake-utils_2": {
|
||||
"locked": {
|
||||
"lastModified": 1667395993,
|
||||
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
|
||||
"type": "github"
|
||||
"inputs": {
|
||||
"systems": "systems_2"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils_3": {
|
||||
"locked": {
|
||||
"lastModified": 1667395993,
|
||||
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
|
||||
"lastModified": 1685518550,
|
||||
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
|
||||
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -139,48 +130,31 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1670525689,
|
||||
"narHash": "sha256-YIjGzxrRQa5LYO0zlnH/ardcwXsRgsnHe3TkGkvCxbc=",
|
||||
"lastModified": 1685860998,
|
||||
"narHash": "sha256-ZexAPe8yvJaLvn5aVgjW0vY41RnmJGbgOdGBJk1yDIE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "f21f11aa2a02cb78651c6d57546c7d7541f9240c",
|
||||
"rev": "45d47b647d7bbaede5121d731cbee78f6093b6d6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-stable": {
|
||||
"locked": {
|
||||
"lastModified": 1668984258,
|
||||
"narHash": "sha256-0gDMJ2T3qf58xgcSbYoXiRGUkPWmKyr5C3vcathWhKs=",
|
||||
"lastModified": 1678872516,
|
||||
"narHash": "sha256-/E1YwtMtFAu2KUQKV/1+KFuReYPANM2Rzehk84VxVoc=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "cf63ade6f74bbc9d2a017290f1b2e33e8fbfa70a",
|
||||
"rev": "9b8e5abb18324c7fe9f07cb100c3cd4a29cda8b8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-22.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1668994630,
|
||||
"narHash": "sha256-1lqx6HLyw6fMNX/hXrrETG1vMvZRGm2XVC9O/Jt0T6c=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "af50806f7c6ab40df3e6b239099e8f8385f6c78b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"ref": "nixos-22.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
@ -188,17 +162,21 @@
|
||||
"pre-commit-hooks": {
|
||||
"inputs": {
|
||||
"flake-compat": "flake-compat_2",
|
||||
"flake-utils": "flake-utils_3",
|
||||
"flake-utils": [
|
||||
"flake-utils"
|
||||
],
|
||||
"gitignore": "gitignore",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"nixpkgs-stable": "nixpkgs-stable"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1670413394,
|
||||
"narHash": "sha256-M7sWqrKtOqUv9euX1t3HCxis8cPy9MNiZxQmUf0KF1o=",
|
||||
"lastModified": 1685361114,
|
||||
"narHash": "sha256-4RjrlSb+OO+e1nzTExKW58o3WRwVGpXwj97iCta8aj4=",
|
||||
"owner": "cachix",
|
||||
"repo": "pre-commit-hooks.nix",
|
||||
"rev": "1303a1a76e9eb074075bfe566518c413f6fc104e",
|
||||
"rev": "ca2fdbf3edda2a38140184da6381d49f8206eaf4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -229,11 +207,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1670034122,
|
||||
"narHash": "sha256-EqmuOKucPWtMvCZtHraHr3Q3bgVszq1x2PoZtQkUuEk=",
|
||||
"lastModified": 1683080331,
|
||||
"narHash": "sha256-nGDvJ1DAxZIwdn6ww8IFwzoHb2rqBP4wv/65Wt5vflk=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "a0d5773275ecd4f141d792d3a0376277c0fc0b65",
|
||||
"rev": "d59c3fa0cba8336e115b376c2d9e91053aa59e56",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -252,11 +230,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1670552927,
|
||||
"narHash": "sha256-lCE51eAGrAFS4k9W5aDGFpVtOAwQQ/rFMN80PCDh0vo=",
|
||||
"lastModified": 1685846256,
|
||||
"narHash": "sha256-G4aYK4VqlMHImvZ0lUnLHw1A+Cx28T0sBMvAKZBcGpk=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "a0fdafd18c9cf599fde17fbaf07dbb20fa57eecb",
|
||||
"rev": "1ef3c6de6127a1cba94cc5492cdde52e33d06ea4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -264,6 +242,36 @@
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_2": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
39
flake.nix
39
flake.nix
@ -3,7 +3,7 @@
|
||||
"ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||
nixpkgs.url = "github:NixOS/nixpkgs";
|
||||
|
||||
crane = {
|
||||
url = "github:ipetkov/crane";
|
||||
@ -25,7 +25,13 @@
|
||||
flake = false;
|
||||
};
|
||||
|
||||
pre-commit-hooks.url = "github:cachix/pre-commit-hooks.nix";
|
||||
pre-commit-hooks = {
|
||||
url = "github:cachix/pre-commit-hooks.nix";
|
||||
inputs = {
|
||||
nixpkgs.follows = "nixpkgs";
|
||||
flake-utils.follows = "flake-utils";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, crane, flake-utils, rust-overlay, advisory-db
|
||||
@ -36,14 +42,16 @@
|
||||
inherit system;
|
||||
overlays = [ (import rust-overlay) ];
|
||||
};
|
||||
inherit (pkgs) lib;
|
||||
|
||||
craneLib = crane.lib.${system};
|
||||
src = craneLib.cleanCargoSource ./.;
|
||||
src = pkgs.lib.cleanSourceWith {
|
||||
src = craneLib.path ./.;
|
||||
filter = pkgs.lib.cleanSourceFilter;
|
||||
};
|
||||
|
||||
buildInputs = with pkgs;
|
||||
[ ffmpeg imagemagick pandoc poppler_utils ripgrep tesseract ]
|
||||
++ lib.optionals pkgs.stdenv.isDarwin [
|
||||
++ pkgs.lib.optionals pkgs.stdenv.isDarwin [
|
||||
# Additional darwin specific inputs can be set here
|
||||
pkgs.libiconv
|
||||
];
|
||||
@ -54,10 +62,7 @@
|
||||
|
||||
# Build the actual crate itself, reusing the dependency
|
||||
# artifacts from above.
|
||||
rga = craneLib.buildPackage {
|
||||
inherit cargoArtifacts src buildInputs;
|
||||
doCheck = false;
|
||||
};
|
||||
rga = craneLib.buildPackage { inherit cargoArtifacts src buildInputs; };
|
||||
|
||||
pre-commit = pre-commit-hooks.lib."${system}".run;
|
||||
in {
|
||||
@ -97,18 +102,20 @@
|
||||
hooks = {
|
||||
nixfmt.enable = true;
|
||||
rustfmt.enable = true;
|
||||
cargo-check.enable = true;
|
||||
typos = {
|
||||
enable = true;
|
||||
types = [ "text" ];
|
||||
excludes = [ "exampledir/.*" ];
|
||||
};
|
||||
};
|
||||
};
|
||||
} // lib.optionalAttrs (system == "x86_64-linux") {
|
||||
# NB: cargo-tarpaulin only supports x86_64 systems
|
||||
# Check code coverage (note: this will not upload coverage anywhere)
|
||||
rga-coverage =
|
||||
craneLib.cargoTarpaulin { inherit cargoArtifacts src; };
|
||||
};
|
||||
|
||||
# `nix build`
|
||||
packages.default = rga;
|
||||
packages = {
|
||||
inherit rga; # `nix build .#rga`
|
||||
default = rga; # `nix build`
|
||||
};
|
||||
|
||||
# `nix run`
|
||||
apps.default = flake-utils.lib.mkApp { drv = rga; };
|
||||
|
@ -10,6 +10,7 @@ pub mod writing;
|
||||
pub mod zip;
|
||||
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
|
||||
use anyhow::{format_err, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use custom::CustomAdapterConfig;
|
||||
use custom::BUILTIN_SPAWNING_ADAPTERS;
|
||||
use log::*;
|
||||
@ -77,11 +78,17 @@ impl AdapterMeta {
|
||||
pub trait GetMetadata {
|
||||
fn metadata(&self) -> &AdapterMeta;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait FileAdapter: GetMetadata + Send + Sync {
|
||||
/// adapt a file.
|
||||
///
|
||||
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
|
||||
fn adapt(&self, a: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox>;
|
||||
async fn adapt(
|
||||
&self,
|
||||
a: AdaptInfo,
|
||||
detection_reason: &FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox>;
|
||||
}
|
||||
|
||||
pub struct AdaptInfo {
|
||||
|
@ -49,8 +49,9 @@ pub struct CustomAdapterConfig {
|
||||
pub args: Vec<String>,
|
||||
/// The output path hint. The placeholders are the same as for `.args`
|
||||
///
|
||||
/// If not set, defaults to ${input_virtual_path}.txt
|
||||
/// If not set, defaults to "${input_virtual_path}.txt"
|
||||
///
|
||||
/// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter
|
||||
pub output_path_hint: Option<String>,
|
||||
}
|
||||
|
||||
@ -128,7 +129,6 @@ lazy_static! {
|
||||
disabled_by_default: None,
|
||||
match_only_by_mime: None,
|
||||
output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into())
|
||||
// postprocessors: [{name: "add_page_numbers_by_pagebreaks"}]
|
||||
}
|
||||
];
|
||||
}
|
||||
@ -143,15 +143,13 @@ pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow:
|
||||
}
|
||||
}
|
||||
|
||||
fn proc_wait(mut child: Child) -> impl AsyncRead {
|
||||
fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead {
|
||||
let s = stream! {
|
||||
let res = child.wait().await?;
|
||||
if res.success() {
|
||||
yield std::io::Result::Ok(Bytes::new());
|
||||
} else {
|
||||
yield std::io::Result::Err(to_io_err(
|
||||
format_err!("subprocess failed: {:?}", res),
|
||||
));
|
||||
Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?;
|
||||
}
|
||||
};
|
||||
StreamReader::new(s)
|
||||
@ -164,6 +162,7 @@ pub fn pipe_output(
|
||||
exe_name: &str,
|
||||
help: &str,
|
||||
) -> Result<ReadBox> {
|
||||
let cmd_log = format!("{:?}", cmd); // todo: perf
|
||||
let mut cmd = cmd
|
||||
.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
@ -177,10 +176,9 @@ pub fn pipe_output(
|
||||
tokio::io::copy(&mut z, &mut stdi).await?;
|
||||
std::io::Result::Ok(())
|
||||
});
|
||||
|
||||
Ok(Box::pin(
|
||||
stdo.chain(proc_wait(cmd).chain(join_handle_to_stream(join))),
|
||||
))
|
||||
Ok(Box::pin(stdo.chain(
|
||||
proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)),
|
||||
)))
|
||||
}
|
||||
|
||||
pub struct CustomSpawningFileAdapter {
|
||||
@ -224,8 +222,9 @@ impl CustomSpawningFileAdapter {
|
||||
Ok(command)
|
||||
}
|
||||
}
|
||||
#[async_trait]
|
||||
impl FileAdapter for CustomSpawningFileAdapter {
|
||||
fn adapt<'a>(
|
||||
async fn adapt(
|
||||
&self,
|
||||
ai: AdaptInfo,
|
||||
_detection_reason: &FileMatcher,
|
||||
@ -314,7 +313,7 @@ mod test {
|
||||
|
||||
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
||||
// let r = adapter.adapt(a, &d)?;
|
||||
let r = loop_adapt(&adapter, d, a)?;
|
||||
let r = loop_adapt(&adapter, d, a).await?;
|
||||
let o = adapted_to_vec(r).await?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o)?,
|
||||
@ -368,7 +367,7 @@ PREFIX:Page 1:
|
||||
Path::new("foo.txt"),
|
||||
Box::pin(Cursor::new(Vec::from(input))),
|
||||
);
|
||||
let output = adapter.adapt(a, &d).unwrap();
|
||||
let output = adapter.adapt(a, &d).await.unwrap();
|
||||
|
||||
let oup = adapted_to_vec(output).await?;
|
||||
println!("output: {}", String::from_utf8_lossy(&oup));
|
||||
|
@ -93,8 +93,13 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
|
||||
filename.with_file_name(format!("{}{}", stem, new_extension))
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileAdapter for DecompressAdapter {
|
||||
fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
|
||||
async fn adapt(
|
||||
&self,
|
||||
ai: AdaptInfo,
|
||||
detection_reason: &FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
Ok(one_file(AdaptInfo {
|
||||
filepath_hint: get_inner_filename(&ai.filepath_hint),
|
||||
is_real_file: false,
|
||||
@ -137,7 +142,7 @@ mod tests {
|
||||
let filepath = test_data_dir().join("hello.gz");
|
||||
|
||||
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
||||
let r = adapter.adapt(a, &d)?;
|
||||
let r = adapter.adapt(a, &d).await?;
|
||||
let o = adapted_to_vec(r).await?;
|
||||
assert_eq!(String::from_utf8(o)?, "hello\n");
|
||||
Ok(())
|
||||
@ -150,7 +155,7 @@ mod tests {
|
||||
let filepath = test_data_dir().join("short.pdf.gz");
|
||||
|
||||
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
||||
let r = loop_adapt(&adapter, d, a)?;
|
||||
let r = loop_adapt(&adapter, d, a).await?;
|
||||
let o = adapted_to_vec(r).await?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o)?,
|
||||
|
@ -14,13 +14,15 @@ use writing::WritingFileAdapter;
|
||||
// maybe todo: read list of extensions from
|
||||
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
|
||||
// but really, the probability of getting useful information from a .flv is low
|
||||
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
|
||||
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
name: "ffmpeg".to_owned(),
|
||||
version: 1,
|
||||
description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
|
||||
description:
|
||||
"Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata"
|
||||
.to_owned(),
|
||||
recurses: false,
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
@ -52,7 +54,7 @@ struct FFprobeOutput {
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct FFprobeStream {
|
||||
codec_type: String, // video,audio,subtitle
|
||||
index: i32, // stream index
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@ -78,17 +80,17 @@ impl WritingFileAdapter for FFmpegAdapter {
|
||||
}
|
||||
let inp_fname = filepath_hint;
|
||||
let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed.");
|
||||
let has_subtitles = {
|
||||
let subtitle_streams = {
|
||||
let probe = Command::new("ffprobe")
|
||||
.args(vec![
|
||||
"-v",
|
||||
"error",
|
||||
"error", // show all errors
|
||||
"-select_streams",
|
||||
"s",
|
||||
"s", // show only subtitle streams
|
||||
"-of",
|
||||
"json",
|
||||
"json", // use json as output format
|
||||
"-show_entries",
|
||||
"stream=codec_type",
|
||||
"stream=index", // show index of subtitle streams
|
||||
])
|
||||
.arg("-i")
|
||||
.arg(&inp_fname)
|
||||
@ -96,10 +98,14 @@ impl WritingFileAdapter for FFmpegAdapter {
|
||||
.await
|
||||
.map_err(spawn_fail)?;
|
||||
if !probe.status.success() {
|
||||
return Err(format_err!("ffprobe failed: {:?}", probe.status));
|
||||
return Err(format_err!(
|
||||
"ffprobe failed: {:?}\n{}",
|
||||
probe.status,
|
||||
String::from_utf8_lossy(&probe.stderr)
|
||||
));
|
||||
}
|
||||
let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?;
|
||||
!p.streams.is_empty()
|
||||
p.streams
|
||||
};
|
||||
{
|
||||
// extract file metadata (especially chapter names in a greppable format)
|
||||
@ -124,6 +130,7 @@ impl WritingFileAdapter for FFmpegAdapter {
|
||||
.spawn()?;
|
||||
let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines();
|
||||
while let Some(line) = lines.next_line().await? {
|
||||
let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines
|
||||
async_writeln!(oup, "metadata: {line}")?;
|
||||
}
|
||||
let exit = probe.wait().await?;
|
||||
@ -131,31 +138,35 @@ impl WritingFileAdapter for FFmpegAdapter {
|
||||
return Err(format_err!("ffprobe failed: {:?}", exit));
|
||||
}
|
||||
}
|
||||
if has_subtitles {
|
||||
// extract subtitles
|
||||
let mut cmd = Command::new("ffmpeg");
|
||||
cmd.arg("-hide_banner")
|
||||
.arg("-loglevel")
|
||||
.arg("panic")
|
||||
.arg("-i")
|
||||
.arg(&inp_fname)
|
||||
.arg("-f")
|
||||
.arg("webvtt")
|
||||
.arg("-");
|
||||
let mut cmd = cmd.stdout(Stdio::piped()).spawn().map_err(spawn_fail)?;
|
||||
let stdo = cmd.stdout.as_mut().expect("is piped");
|
||||
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
|
||||
let mut time: String = "".to_owned();
|
||||
// rewrite subtitle times so they are shown as a prefix in every line
|
||||
let mut lines = BufReader::new(stdo).lines();
|
||||
while let Some(line) = lines.next_line().await? {
|
||||
// 09:55.195 --> 09:56.730
|
||||
if time_re.is_match(&line) {
|
||||
time = line.to_owned();
|
||||
} else if line.is_empty() {
|
||||
async_writeln!(oup)?;
|
||||
} else {
|
||||
async_writeln!(oup, "{time}: {line}")?;
|
||||
if !subtitle_streams.is_empty() {
|
||||
for probe_stream in subtitle_streams.iter() {
|
||||
// extract subtitles
|
||||
let mut cmd = Command::new("ffmpeg");
|
||||
cmd.arg("-hide_banner")
|
||||
.arg("-loglevel")
|
||||
.arg("panic")
|
||||
.arg("-i")
|
||||
.arg(&inp_fname)
|
||||
.arg("-map")
|
||||
.arg(format!("0:{}", probe_stream.index)) // 0 for first input
|
||||
.arg("-f")
|
||||
.arg("webvtt")
|
||||
.arg("-");
|
||||
let mut cmd = cmd.stdout(Stdio::piped()).spawn().map_err(spawn_fail)?;
|
||||
let stdo = cmd.stdout.as_mut().expect("is piped");
|
||||
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
|
||||
let mut time: String = "".to_owned();
|
||||
// rewrite subtitle times so they are shown as a prefix in every line
|
||||
let mut lines = BufReader::new(stdo).lines();
|
||||
while let Some(line) = lines.next_line().await? {
|
||||
// 09:55.195 --> 09:56.730
|
||||
if time_re.is_match(&line) {
|
||||
time = line.to_owned();
|
||||
} else if line.is_empty() {
|
||||
async_writeln!(oup)?;
|
||||
} else {
|
||||
async_writeln!(oup, "{time}: {line}")?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,11 @@
|
||||
|
||||
use anyhow::Result;
|
||||
use async_stream::stream;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use encoding_rs::Encoding;
|
||||
use encoding_rs_io::DecodeReaderBytesBuilder;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
|
||||
use std::io::Cursor;
|
||||
use std::path::PathBuf;
|
||||
@ -41,15 +45,16 @@ impl GetMetadata for PostprocPrefix {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
#[async_trait]
|
||||
impl FileAdapter for PostprocPrefix {
|
||||
fn adapt<'a>(
|
||||
async fn adapt(
|
||||
&self,
|
||||
a: super::AdaptInfo,
|
||||
_detection_reason: &crate::matching::FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
let read = add_newline(postproc_prefix(
|
||||
&a.line_prefix,
|
||||
postproc_encoding(&a.line_prefix, a.inp)?,
|
||||
postproc_encoding(&a.line_prefix, a.inp).await?,
|
||||
));
|
||||
// keep adapt info (filename etc) except replace inp
|
||||
let ai = AdaptInfo {
|
||||
@ -74,50 +79,53 @@ impl Read for ReadErr {
|
||||
* Detects and converts encodings other than utf-8 to utf-8.
|
||||
* If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
|
||||
*/
|
||||
pub fn postproc_encoding(
|
||||
async fn postproc_encoding(
|
||||
_line_prefix: &str,
|
||||
inp: impl AsyncRead + Send + 'static,
|
||||
inp: Pin<Box<dyn AsyncRead + Send>>,
|
||||
) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
|
||||
Ok(Box::pin(inp))
|
||||
// panic!("todo: implement");
|
||||
/*// TODO: parse these options from ripgrep's configuration
|
||||
let encoding = None; // detect bom but usually assume utf8
|
||||
let bom_sniffing = true;
|
||||
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
||||
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
|
||||
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
|
||||
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
|
||||
let inp = decode_builder
|
||||
.encoding(encoding)
|
||||
.utf8_passthru(true)
|
||||
.strip_bom(bom_sniffing)
|
||||
.bom_override(true)
|
||||
.bom_sniffing(bom_sniffing)
|
||||
.build(inp);
|
||||
|
||||
// check for binary content in first 8kB
|
||||
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
|
||||
let mut fourk = Vec::with_capacity(1 << 13);
|
||||
let mut beginning = inp.take(1 << 13);
|
||||
|
||||
beginning.read_to_end(&mut fourk)?;
|
||||
beginning.read_to_end(&mut fourk).await?;
|
||||
let has_binary = fourk.contains(&0u8);
|
||||
|
||||
if fourk.contains(&0u8) {
|
||||
log::debug!("detected binary");
|
||||
let v = "[rga: binary data]";
|
||||
return Ok(Box::new(std::io::Cursor::new(v)));
|
||||
/*let err = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("{}[rga: binary data]", line_prefix),
|
||||
);
|
||||
return Err(err).context("");
|
||||
return ReadErr {
|
||||
err,
|
||||
};*/
|
||||
let enc = Encoding::for_bom(&fourk);
|
||||
let inp = Cursor::new(fourk).chain(beginning.into_inner());
|
||||
match enc {
|
||||
Some((enc, _)) if enc != encoding_rs::UTF_8 => {
|
||||
// detected UTF16LE or UTF16BE, convert to UTF8 in separate thread
|
||||
// TODO: parse these options from ripgrep's configuration
|
||||
let encoding = None; // detect bom but usually assume utf8
|
||||
let bom_sniffing = true;
|
||||
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
||||
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
|
||||
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
|
||||
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
|
||||
let mut inp = decode_builder
|
||||
.encoding(encoding)
|
||||
.utf8_passthru(true)
|
||||
.strip_bom(bom_sniffing)
|
||||
.bom_override(true)
|
||||
.bom_sniffing(bom_sniffing)
|
||||
.build(SyncIoBridge::new(inp));
|
||||
let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
|
||||
let mut oup = Vec::new();
|
||||
std::io::Read::read_to_end(&mut inp, &mut oup)?;
|
||||
Ok(oup)
|
||||
})
|
||||
.await??;
|
||||
Ok(Box::pin(Cursor::new(oup)))
|
||||
}
|
||||
_ => {
|
||||
if has_binary {
|
||||
log::debug!("detected binary");
|
||||
return Ok(Box::pin(Cursor::new("[rga: binary data]")));
|
||||
}
|
||||
Ok(Box::pin(inp))
|
||||
}
|
||||
}
|
||||
Ok(Box::new(
|
||||
std::io::Cursor::new(fourk).chain(beginning.into_inner()),
|
||||
))*/
|
||||
}
|
||||
|
||||
/// Adds the given prefix to each line in an `AsyncRead`.
|
||||
@ -164,13 +172,14 @@ impl GetMetadata for PostprocPageBreaks {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
#[async_trait]
|
||||
impl FileAdapter for PostprocPageBreaks {
|
||||
fn adapt<'a>(
|
||||
async fn adapt(
|
||||
&self,
|
||||
a: super::AdaptInfo,
|
||||
_detection_reason: &crate::matching::FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp)?);
|
||||
let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?);
|
||||
// keep adapt info (filename etc) except replace inp
|
||||
let ai = AdaptInfo {
|
||||
inp: Box::pin(read),
|
||||
@ -282,7 +291,7 @@ mod tests {
|
||||
let fname = test_data_dir().join("twoblankpages.pdf");
|
||||
let rd = File::open(&fname).await?;
|
||||
let (a, d) = simple_adapt_info(&fname, Box::pin(rd));
|
||||
let res = loop_adapt(&adapter, d, a)?;
|
||||
let res = loop_adapt(&adapter, d, a).await?;
|
||||
|
||||
let buf = adapted_to_vec(res).await?;
|
||||
|
||||
@ -327,7 +336,8 @@ PREFIX:Page 3:
|
||||
b: &str,
|
||||
) -> Result<()> {
|
||||
let mut oup = Vec::new();
|
||||
let inp = postproc_encoding("", a)?;
|
||||
let inp = Box::pin(Cursor::new(a));
|
||||
let inp = postproc_encoding("", inp).await?;
|
||||
if pagebreaks {
|
||||
postproc_pagebreaks(inp).read_to_end(&mut oup).await?;
|
||||
} else {
|
||||
@ -341,6 +351,23 @@ PREFIX:Page 3:
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_utf16() -> Result<()> {
|
||||
let utf16lebom: &[u8] = &[
|
||||
0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00,
|
||||
0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8,
|
||||
0xa9, 0xdc, 0x0a, 0x00,
|
||||
];
|
||||
let utf16bebom: &[u8] = &[
|
||||
0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20,
|
||||
0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d,
|
||||
0xdc, 0xa9, 0x00, 0x0a,
|
||||
];
|
||||
test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?;
|
||||
test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn post1() -> Result<()> {
|
||||
let inp = "What is this\nThis is a test\nFoo";
|
||||
@ -362,20 +389,19 @@ PREFIX:Page 3:
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/*
|
||||
todo: uncomment when fixed
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_binary_content() -> Result<()> {
|
||||
test_from_strs(
|
||||
false,
|
||||
"foo:",
|
||||
"this is a test \n\n \0 foo",
|
||||
"foo:[rga: binary data]",
|
||||
)
|
||||
.await?;
|
||||
test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
|
||||
Ok(())
|
||||
}*/
|
||||
async fn test_binary_content() -> Result<()> {
|
||||
test_from_strs(
|
||||
false,
|
||||
"foo:",
|
||||
"this is a test \n\n \0 foo",
|
||||
"foo:[rga: binary data]",
|
||||
)
|
||||
.await?;
|
||||
test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*#[test]
|
||||
fn chardet() -> Result<()> {
|
||||
|
@ -77,11 +77,13 @@ fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
let inp_fname = filepath_hint;
|
||||
|
||||
let conn = Connection::open_with_flags(inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
|
||||
let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY)
|
||||
.with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?;
|
||||
let tables: Vec<String> = conn
|
||||
.prepare("select name from sqlite_master where type='table'")?
|
||||
.query_map([], |r| r.get::<_, String>(0))?
|
||||
.prepare("select name from sqlite_master where type='table'")
|
||||
.context("while preparing query")?
|
||||
.query_map([], |r| r.get::<_, String>(0))
|
||||
.context("while executing query")?
|
||||
.filter_map(|e| e.ok())
|
||||
.collect();
|
||||
debug!("db has {} tables", tables.len());
|
||||
@ -121,7 +123,9 @@ impl WritingFileAdapter for SqliteAdapter {
|
||||
oup: Pin<Box<dyn AsyncWrite + Send>>,
|
||||
) -> Result<()> {
|
||||
let oup_sync = SyncIoBridge::new(oup);
|
||||
tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)).await??;
|
||||
tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync))
|
||||
.await?
|
||||
.context("in synchronous sqlite task")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@ -134,10 +138,10 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn simple() -> Result<()> {
|
||||
let adapter: Box<dyn FileAdapter> = Box::new(SqliteAdapter::default());
|
||||
let adapter: Box<dyn FileAdapter> = Box::<SqliteAdapter>::default();
|
||||
let fname = test_data_dir().join("hello.sqlite3");
|
||||
let (a, d) = simple_fs_adapt_info(&fname).await?;
|
||||
let res = adapter.adapt(a, &d)?;
|
||||
let res = adapter.adapt(a, &d).await?;
|
||||
|
||||
let buf = adapted_to_vec(res).await?;
|
||||
|
||||
|
@ -6,6 +6,7 @@ use crate::{
|
||||
};
|
||||
use anyhow::*;
|
||||
use async_stream::stream;
|
||||
use async_trait::async_trait;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use std::path::PathBuf;
|
||||
@ -45,8 +46,13 @@ impl GetMetadata for TarAdapter {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileAdapter for TarAdapter {
|
||||
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
|
||||
async fn adapt(
|
||||
&self,
|
||||
ai: AdaptInfo,
|
||||
_detection_reason: &FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
let AdaptInfo {
|
||||
filepath_hint,
|
||||
inp,
|
||||
@ -103,7 +109,7 @@ mod tests {
|
||||
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
||||
|
||||
let adapter = TarAdapter::new();
|
||||
let r = loop_adapt(&adapter, d, a).context("adapt")?;
|
||||
let r = loop_adapt(&adapter, d, a).await.context("adapt")?;
|
||||
let o = adapted_to_vec(r).await.context("adapted_to_vec")?;
|
||||
assert_eq!(
|
||||
String::from_utf8(o).context("parsing utf8")?,
|
||||
|
@ -3,7 +3,7 @@ use std::pin::Pin;
|
||||
use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err};
|
||||
|
||||
use super::{AdaptInfo, FileAdapter, GetMetadata};
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use tokio::io::{AsyncReadExt, AsyncWrite};
|
||||
|
||||
@ -41,15 +41,17 @@ macro_rules! async_writeln {
|
||||
}
|
||||
pub(crate) use async_writeln;
|
||||
|
||||
#[async_trait]
|
||||
impl<T> FileAdapter for T
|
||||
where
|
||||
T: WritingFileAdapter,
|
||||
{
|
||||
fn adapt(
|
||||
async fn adapt(
|
||||
&self,
|
||||
a: super::AdaptInfo,
|
||||
detection_reason: &crate::matching::FileMatcher,
|
||||
) -> Result<crate::adapted_iter::AdaptedFilesIterBox> {
|
||||
let name = self.metadata().name.clone();
|
||||
let (w, r) = tokio::io::duplex(128 * 1024);
|
||||
let d2 = detection_reason.clone();
|
||||
let archive_recursion_depth = a.archive_recursion_depth + 1;
|
||||
@ -59,7 +61,10 @@ where
|
||||
let config = a.config.clone();
|
||||
let joiner = tokio::spawn(async move {
|
||||
let x = d2;
|
||||
T::adapt_write(a, &x, Box::pin(w)).await.map_err(to_io_err)
|
||||
T::adapt_write(a, &x, Box::pin(w))
|
||||
.await
|
||||
.with_context(|| format!("in {}.adapt_write", name))
|
||||
.map_err(to_io_err)
|
||||
});
|
||||
|
||||
Ok(one_file(AdaptInfo {
|
||||
|
@ -5,7 +5,7 @@ use async_stream::stream;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["zip"];
|
||||
static EXTENSIONS: &[&str] = &["zip", "jar"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
@ -36,8 +36,13 @@ impl GetMetadata for ZipAdapter {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileAdapter for ZipAdapter {
|
||||
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
|
||||
async fn adapt(
|
||||
&self,
|
||||
ai: AdaptInfo,
|
||||
_detection_reason: &FileMatcher,
|
||||
) -> Result<AdaptedFilesIterBox> {
|
||||
// let (s, r) = mpsc::channel(1);
|
||||
let AdaptInfo {
|
||||
inp,
|
||||
@ -52,11 +57,11 @@ impl FileAdapter for ZipAdapter {
|
||||
if is_real_file {
|
||||
use async_zip::read::fs::ZipFileReader;
|
||||
|
||||
let zip = ZipFileReader::new(&filepath_hint).await?;
|
||||
let s = stream! {
|
||||
let zip = ZipFileReader::new(&filepath_hint).await?;
|
||||
for i in 0..zip.entries().len() {
|
||||
let reader = zip.entry_reader(i).await?;
|
||||
let file = reader.entry();
|
||||
for i in 0..zip.file().entries().len() {
|
||||
let file = zip.get_entry(i)?;
|
||||
let reader = zip.entry(i).await?;
|
||||
if file.filename().ends_with('/') {
|
||||
continue;
|
||||
}
|
||||
@ -98,10 +103,11 @@ impl FileAdapter for ZipAdapter {
|
||||
let mut zip = ZipFileReader::new(inp);
|
||||
|
||||
let s = stream! {
|
||||
while !zip.finished() {
|
||||
if let Some(reader) = zip.entry_reader().await? {
|
||||
let file = reader.entry();
|
||||
while let Some(mut entry) = zip.next_entry().await? {
|
||||
let file = entry.entry();
|
||||
if file.filename().ends_with('/') {
|
||||
zip = entry.skip().await?;
|
||||
|
||||
continue;
|
||||
}
|
||||
debug!(
|
||||
@ -114,6 +120,7 @@ impl FileAdapter for ZipAdapter {
|
||||
);
|
||||
let new_line_prefix = format!("{}{}: ", line_prefix, file.filename());
|
||||
let fname = PathBuf::from(file.filename());
|
||||
let reader = entry.reader();
|
||||
tokio::pin!(reader);
|
||||
// SAFETY: this should be solvable without unsafe but idk how :(
|
||||
// the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream
|
||||
@ -133,7 +140,8 @@ impl FileAdapter for ZipAdapter {
|
||||
postprocess,
|
||||
config: config.clone(),
|
||||
});
|
||||
}
|
||||
zip = entry.done().await.context("going to next file in zip but entry was not read fully")?;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
@ -183,7 +191,6 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> {
|
||||
mod test {
|
||||
use async_zip::{write::ZipFileWriter, Compression, ZipEntryBuilder};
|
||||
|
||||
|
||||
use super::*;
|
||||
use crate::{preproc::loop_adapt, test_utils::*};
|
||||
use pretty_assertions::assert_eq;
|
||||
@ -213,7 +220,7 @@ mod test {
|
||||
async fn only_seek_zip_fs() -> Result<()> {
|
||||
let zip = test_data_dir().join("only-seek-zip.zip");
|
||||
let (a, d) = simple_fs_adapt_info(&zip).await?;
|
||||
let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?;
|
||||
let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?;
|
||||
// assert_eq!(String::from_utf8(v)?, "");
|
||||
|
||||
Ok(())
|
||||
@ -236,7 +243,7 @@ mod test {
|
||||
&PathBuf::from("outer.zip"),
|
||||
Box::pin(std::io::Cursor::new(zipfile)),
|
||||
);
|
||||
let buf = adapted_to_vec(loop_adapt(&adapter, d, a)?).await?;
|
||||
let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?;
|
||||
|
||||
assert_eq!(
|
||||
String::from_utf8(buf)?,
|
||||
|
@ -43,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
// happens if e.g. ripgrep detects binary data in the pipe so it cancels reading
|
||||
debug!("output cancelled (broken pipe)");
|
||||
} else {
|
||||
Err(e).context("copying adapter output to stdout {}")?;
|
||||
Err(e).context("copying adapter output to stdout")?;
|
||||
}
|
||||
}
|
||||
debug!("running adapter took {} total", print_dur(start));
|
||||
|
@ -1,17 +1,17 @@
|
||||
use std::pin::Pin;
|
||||
use std::{future::Future, pin::Pin};
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use async_compression::tokio::write::ZstdEncoder;
|
||||
use async_stream::stream;
|
||||
|
||||
use crate::to_io_err;
|
||||
use log::*;
|
||||
use tokio::io::{AsyncRead, AsyncWriteExt};
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::io::{ReaderStream, StreamReader};
|
||||
|
||||
use crate::to_io_err;
|
||||
|
||||
type FinishHandler = dyn FnOnce((u64, Option<Vec<u8>>)) -> Result<()> + Send;
|
||||
type FinishHandler =
|
||||
dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send;
|
||||
/**
|
||||
* wrap an AsyncRead so that it is passthrough,
|
||||
* but also the written data is compressed and written into a buffer,
|
||||
@ -26,7 +26,7 @@ pub fn async_read_and_write_to_cache<'a>(
|
||||
let inp = Box::pin(inp);
|
||||
let mut zstd_writer = Some(ZstdEncoder::with_quality(
|
||||
Vec::new(),
|
||||
async_compression::Level::Precise(compression_level as u32),
|
||||
async_compression::Level::Precise(compression_level),
|
||||
));
|
||||
let mut bytes_written = 0;
|
||||
|
||||
@ -64,7 +64,7 @@ pub fn async_read_and_write_to_cache<'a>(
|
||||
};
|
||||
|
||||
// EOF, finish!
|
||||
on_finish(finish)
|
||||
on_finish(finish).await.context("write_to_cache on_finish")
|
||||
.map_err(to_io_err)?;
|
||||
|
||||
};
|
||||
|
@ -108,6 +108,7 @@ impl FromStr for CacheMaxBlobLen {
|
||||
rename_all = "kebab-case",
|
||||
about = env!("CARGO_PKG_DESCRIPTION"),
|
||||
author = env!("CARGO_PKG_HOMEPAGE"),
|
||||
long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.",
|
||||
// TODO: long_about does not seem to work to only show this on short help
|
||||
after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]",
|
||||
usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]"
|
||||
@ -197,9 +198,9 @@ pub struct CacheConfig {
|
||||
/// Disable caching of results
|
||||
///
|
||||
/// By default, rga caches the extracted text, if it is small enough,
|
||||
/// to a database in ~/.cache/rga on Linux,
|
||||
/// ~/Library/Caches/rga on macOS,
|
||||
/// or C:\Users\username\AppData\Local\rga on Windows.
|
||||
/// to a database in ${XDG_CACHE_HOME-~/.cache}/ripgrep-all on Linux,
|
||||
/// ~/Library/Caches/ripgrep-all on macOS,
|
||||
/// or C:\Users\username\AppData\Local\ripgrep-all on Windows.
|
||||
/// This way, repeated searches on the same set of files will be much faster.
|
||||
/// If you pass this flag, all caching will be disabled.
|
||||
#[serde(default, skip_serializing_if = "is_default")]
|
||||
@ -208,7 +209,9 @@ pub struct CacheConfig {
|
||||
|
||||
/// Max compressed size to cache
|
||||
///
|
||||
/// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time. Allowed suffixes: k M G
|
||||
/// Longest byte length (after compression) to store in cache. Longer adapter outputs will not be cached and recomputed every time.
|
||||
///
|
||||
/// Allowed suffixes on command line: k M G
|
||||
#[serde(default, skip_serializing_if = "is_default")]
|
||||
#[structopt(
|
||||
default_value,
|
||||
|
105
src/preproc.rs
105
src/preproc.rs
@ -3,25 +3,28 @@ use crate::adapters::*;
|
||||
use crate::caching_writer::async_read_and_write_to_cache;
|
||||
use crate::config::RgaConfig;
|
||||
use crate::matching::*;
|
||||
use crate::preproc_cache::CacheKey;
|
||||
use crate::recurse::concat_read_streams;
|
||||
use crate::{
|
||||
preproc_cache::{LmdbCache, PreprocCache},
|
||||
preproc_cache::{open_cache_db, PreprocCache},
|
||||
print_bytes,
|
||||
};
|
||||
use anyhow::*;
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
use async_stream::stream;
|
||||
// use futures::future::{BoxFuture, FutureExt};
|
||||
use log::*;
|
||||
use path_clean::PathClean;
|
||||
use postproc::PostprocPrefix;
|
||||
use std::future::Future;
|
||||
use std::io::Cursor;
|
||||
use std::path::Path;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::AsyncBufRead;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::BufReader;
|
||||
|
||||
type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
|
||||
pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
|
||||
|
||||
async fn choose_adapter(
|
||||
config: &RgaConfig,
|
||||
@ -120,36 +123,6 @@ pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
|
||||
.with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
|
||||
}
|
||||
|
||||
fn compute_cache_key(
|
||||
filepath_hint: &Path,
|
||||
adapter: &dyn FileAdapter,
|
||||
active_adapters: ActiveAdapters,
|
||||
) -> Result<Vec<u8>> {
|
||||
let clean_path = filepath_hint.to_owned().clean();
|
||||
let meta = std::fs::metadata(filepath_hint)
|
||||
.with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
|
||||
let modified = meta.modified().expect("weird OS that can't into mtime");
|
||||
|
||||
if adapter.metadata().recurses {
|
||||
let active_adapters_cache_key = active_adapters
|
||||
.iter()
|
||||
.map(|a| (a.metadata().name.clone(), a.metadata().version))
|
||||
.collect::<Vec<_>>();
|
||||
let key = (active_adapters_cache_key, clean_path, modified);
|
||||
debug!("Cache key (with recursion): {:?}", key);
|
||||
bincode::serialize(&key).context("could not serialize path")
|
||||
} else {
|
||||
let key = (
|
||||
adapter.metadata().name.clone(),
|
||||
adapter.metadata().version,
|
||||
clean_path,
|
||||
modified,
|
||||
);
|
||||
debug!("Cache key (no recursion): {:?}", key);
|
||||
bincode::serialize(&key).context("could not serialize path")
|
||||
}
|
||||
}
|
||||
|
||||
async fn adapt_caching(
|
||||
ai: AdaptInfo,
|
||||
adapter: Arc<dyn FileAdapter>,
|
||||
@ -166,41 +139,44 @@ async fn adapt_caching(
|
||||
ai.filepath_hint.to_string_lossy(),
|
||||
&meta.name
|
||||
);
|
||||
let db_name = format!("{}.v{}", meta.name, meta.version);
|
||||
let cache_compression_level = ai.config.cache.compression_level;
|
||||
let cache_max_blob_len = ai.config.cache.max_blob_len;
|
||||
|
||||
let cache = if ai.is_real_file {
|
||||
LmdbCache::open(&ai.config.cache)?
|
||||
let cache = if ai.is_real_file && !ai.config.cache.disabled {
|
||||
Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let mut cache = cache.context("No cache?")?;
|
||||
let cache_key: Vec<u8> =
|
||||
compute_cache_key(&ai.filepath_hint, adapter.as_ref(), active_adapters)?;
|
||||
let cache_key = CacheKey::new(&ai.filepath_hint, adapter.as_ref(), &active_adapters)?;
|
||||
// let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
|
||||
let cached = cache.get(&db_name, &cache_key)?;
|
||||
let cached = cache.get(&cache_key).await.context("cache.get")?;
|
||||
match cached {
|
||||
Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
|
||||
None => {
|
||||
debug!("cache MISS, running adapter with caching...");
|
||||
let inp = loop_adapt(adapter.as_ref(), detection_reason, ai)?;
|
||||
let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
|
||||
let inp = concat_read_streams(inp);
|
||||
let inp = async_read_and_write_to_cache(
|
||||
inp,
|
||||
cache_max_blob_len.0,
|
||||
cache_compression_level.0,
|
||||
Box::new(move |(uncompressed_size, compressed)| {
|
||||
debug!(
|
||||
"uncompressed output: {}",
|
||||
print_bytes(uncompressed_size as f64)
|
||||
);
|
||||
if let Some(cached) = compressed {
|
||||
debug!("compressed output: {}", print_bytes(cached.len() as f64));
|
||||
cache.set(&db_name, &cache_key, &cached)?
|
||||
}
|
||||
Ok(())
|
||||
Box::pin(async move {
|
||||
debug!(
|
||||
"uncompressed output: {}",
|
||||
print_bytes(uncompressed_size as f64)
|
||||
);
|
||||
if let Some(cached) = compressed {
|
||||
debug!("compressed output: {}", print_bytes(cached.len() as f64));
|
||||
cache
|
||||
.set(&cache_key, cached)
|
||||
.await
|
||||
.context("writing to cache")?
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}),
|
||||
)?;
|
||||
|
||||
@ -213,21 +189,34 @@ pub fn loop_adapt(
|
||||
adapter: &dyn FileAdapter,
|
||||
detection_reason: FileMatcher,
|
||||
ai: AdaptInfo,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<AdaptedFilesIterBox>> + Send + '_>> {
|
||||
Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
|
||||
}
|
||||
pub async fn loop_adapt_inner(
|
||||
adapter: &dyn FileAdapter,
|
||||
detection_reason: FileMatcher,
|
||||
ai: AdaptInfo,
|
||||
) -> anyhow::Result<AdaptedFilesIterBox> {
|
||||
let fph = ai.filepath_hint.clone();
|
||||
let inp = adapter.adapt(ai, &detection_reason).with_context(|| {
|
||||
format!(
|
||||
"adapting {} via {} failed",
|
||||
fph.to_string_lossy(),
|
||||
adapter.metadata().name
|
||||
)
|
||||
})?;
|
||||
let inp = adapter.adapt(ai, &detection_reason).await;
|
||||
let inp = if adapter.metadata().name == "postprocprefix" {
|
||||
// don't add confusing error context
|
||||
inp?
|
||||
} else {
|
||||
inp.with_context(|| {
|
||||
format!(
|
||||
"adapting {} via {} failed",
|
||||
fph.to_string_lossy(),
|
||||
adapter.metadata().name
|
||||
)
|
||||
})?
|
||||
};
|
||||
let s = stream! {
|
||||
for await file in inp {
|
||||
match buf_choose_adapter(file?).await? {
|
||||
Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
|
||||
if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
|
||||
let s = format!("{}[rga: max archive recursion reached ({})]", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
|
||||
let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
|
||||
yield Ok(AdaptInfo {
|
||||
inp: Box::pin(Cursor::new(s)),
|
||||
..ai
|
||||
@ -243,7 +232,7 @@ pub fn loop_adapt(
|
||||
ai.filepath_hint.to_string_lossy(),
|
||||
&adapter.metadata().name
|
||||
);
|
||||
for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai)? {
|
||||
for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
|
||||
yield ifile;
|
||||
}
|
||||
}
|
||||
|
@ -1,135 +1,188 @@
|
||||
use crate::{config::CacheConfig, print_bytes, print_dur};
|
||||
use anyhow::{format_err, Context, Result};
|
||||
use log::*;
|
||||
use rkv::backend::{BackendEnvironmentBuilder, LmdbEnvironment};
|
||||
use std::{fmt::Display, path::Path, time::Instant};
|
||||
use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
|
||||
use anyhow::{Context, Result};
|
||||
use path_clean::PathClean;
|
||||
use rusqlite::{named_params, OptionalExtension};
|
||||
use std::{path::Path, time::UNIX_EPOCH};
|
||||
use tokio_rusqlite::Connection;
|
||||
|
||||
pub trait PreprocCache: Send + Sync {
|
||||
/*/// gets cache at specified key.
|
||||
/// if cache hit, return the resulting data
|
||||
/// else, run the given lambda, and store its result in the cache if present
|
||||
fn get_or_run<'a>(
|
||||
&mut self,
|
||||
db_name: &str,
|
||||
key: &[u8],
|
||||
debug_name: &str,
|
||||
runner: Box<dyn FnOnce() -> Result<Option<Vec<u8>>> + 'a>,
|
||||
) -> Result<Option<Vec<u8>>>;*/
|
||||
|
||||
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>>;
|
||||
fn set(&mut self, db_name: &str, key: &[u8], value: &[u8]) -> Result<()>;
|
||||
/// Lookup key for one cached adapter output.
///
/// The SQLite cache binds each field to the identically named column of the
/// `preproc_cache` table when reading and writing entries.
#[derive(Clone)]
|
||||
pub struct CacheKey {
|
||||
/// Name of the adapter that produced the output (`adapter.metadata().name`).
adapter: String,
|
||||
/// Version of that adapter (`adapter.metadata().version`).
adapter_version: i32,
|
||||
/// JSON array of `name.vVERSION` strings for all active adapters when the
/// adapter recurses; the literal string `null` otherwise.
active_adapters: String,
|
||||
/// Cleaned (`PathClean::clean`) file path, lossily converted to a string.
file_path: String,
|
||||
/// File modification time in milliseconds since the Unix epoch.
file_mtime_unix_ms: i64,
|
||||
}
|
||||
|
||||
/// opens a LMDB cache
|
||||
fn open_cache_db(
|
||||
path: &Path,
|
||||
) -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>> {
|
||||
std::fs::create_dir_all(path)?;
|
||||
// use rkv::backend::LmdbEnvironmentFlags;
|
||||
|
||||
rkv::Manager::<LmdbEnvironment>::singleton()
|
||||
.write()
|
||||
.map_err(|_| format_err!("could not write cache db manager"))?
|
||||
.get_or_create(path, |p| {
|
||||
let mut builder = rkv::Rkv::environment_builder::<rkv::backend::Lmdb>();
|
||||
builder
|
||||
.set_flags(rkv::EnvironmentFlags::NO_SYNC)
|
||||
.set_flags(rkv::EnvironmentFlags::WRITE_MAP) // not durable cuz it's a cache
|
||||
// i'm not sure why NO_TLS is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
|
||||
// LmdbError(ReadersFull). Those "open readers" stay even after the corresponding processes exit.
|
||||
// hope setting this doesn't break integrity
|
||||
.set_flags(rkv::EnvironmentFlags::NO_TLS)
|
||||
// sometimes, this seems to cause the data.mdb file to appear as 2GB in size (with holes), but sometimes not?
|
||||
.set_map_size(2 * 1024 * 1024 * 1024)
|
||||
.set_max_dbs(100)
|
||||
.set_max_readers(128);
|
||||
rkv::Rkv::from_builder(p, builder)
|
||||
impl CacheKey {
|
||||
pub fn new(
|
||||
filepath_hint: &Path,
|
||||
adapter: &dyn FileAdapter,
|
||||
active_adapters: &ActiveAdapters,
|
||||
) -> Result<CacheKey> {
|
||||
let meta = std::fs::metadata(filepath_hint)
|
||||
.with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
|
||||
let modified = meta.modified().expect("weird OS that can't into mtime");
|
||||
let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
|
||||
let active_adapters = if adapter.metadata().recurses {
|
||||
serde_json::to_string(
|
||||
&active_adapters
|
||||
.iter()
|
||||
.map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
|
||||
.collect::<Vec<_>>(),
|
||||
)?
|
||||
} else {
|
||||
"null".to_string()
|
||||
};
|
||||
Ok(CacheKey {
|
||||
adapter: adapter.metadata().name.clone(),
|
||||
adapter_version: adapter.metadata().version,
|
||||
file_path: filepath_hint.clean().to_string_lossy().to_string(),
|
||||
file_mtime_unix_ms,
|
||||
active_adapters,
|
||||
})
|
||||
.map_err(|e| format_err!("could not get/create cache db: {}", e))
|
||||
}
|
||||
|
||||
pub struct LmdbCache {
|
||||
db_arc: std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>,
|
||||
}
|
||||
|
||||
impl LmdbCache {
|
||||
pub fn open(config: &CacheConfig) -> Result<Option<LmdbCache>> {
|
||||
if config.disabled {
|
||||
return Ok(None);
|
||||
}
|
||||
let path = Path::new(&config.path.0);
|
||||
Ok(Some(LmdbCache {
|
||||
db_arc: open_cache_db(path)?,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RkvErrWrap(rkv::StoreError);
|
||||
impl Display for RkvErrWrap {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
/// Asynchronous store for adapter output, addressed by [`CacheKey`].
///
/// The caller in `adapt_caching` stores zstd-compressed adapter output and
/// feeds whatever `get` returns into a `ZstdDecoder`.
#[async_trait::async_trait]
|
||||
pub trait PreprocCache {
|
||||
/// Returns the blob stored for `key`, if any.
///
/// The SQLite implementation yields `Ok(None)` when no row matches all
/// fields of `key` (including the file mtime).
async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
|
||||
/// Stores `value` as the blob for `key`.
///
/// The SQLite implementation overwrites an existing row that has the same
/// adapter, adapter version, active adapters and file path.
async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Prepares a freshly opened cache connection.
///
/// Runs one SQL batch that sets the connection pragmas and creates the
/// `preproc_cache` table and its unique index if they do not exist yet.
/// Afterwards the database's `application_id` is read; if it is not
/// `924716026`, `create_pragmas` is called to stamp the database.
///
/// # Errors
/// Returns an error if the batch, the pragma query, or `create_pragmas` fails.
async fn connect_pragmas(db: &Connection) -> Result<()> {
|
||||
// https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
|
||||
//let want_page_size = 32768;
|
||||
//db.execute(&format!("pragma page_size = {};", want_page_size))
|
||||
// .context("setup pragma 1")?;
|
||||
// The closure is handed to `Connection::call`, which gives it the underlying
// rusqlite connection.
db.call(|db| {
|
||||
db.execute_batch(
|
||||
"
|
||||
pragma journal_mode = WAL;
|
||||
pragma foreign_keys = on;
|
||||
pragma temp_store = memory;
|
||||
pragma synchronous = off; -- integrity isn't very important here
|
||||
pragma mmap_size = 30000000000;
|
||||
|
||||
create table if not exists preproc_cache (
|
||||
adapter text not null,
|
||||
adapter_version integer not null,
|
||||
created_unix_ms integer not null default (unixepoch() * 1000),
|
||||
active_adapters text not null, -- 'null' if adapter cannot recurse
|
||||
file_path text not null,
|
||||
file_mtime_unix_ms integer not null,
|
||||
text_content_zstd blob not null
|
||||
) strict;
|
||||
|
||||
create unique index if not exists preproc_cache_idx on preproc_cache (adapter, adapter_version, file_path, active_adapters);
|
||||
",
|
||||
)
|
||||
})
|
||||
.await.context("connect_pragmas")?;
|
||||
// NOTE: despite its name, `jm` holds the value of the `application_id`
// pragma, not the journal mode.
let jm: i64 = db
|
||||
.call(|db| db.pragma_query_value(None, "application_id", |r| r.get(0)))
|
||||
.await?;
|
||||
// 924716026 is the id written by `create_pragmas`; any other value is
// treated as a database that has not been stamped yet.
if jm != 924716026 {
|
||||
// (probably) newly created db
|
||||
create_pragmas(db).await.context("create_pragmas")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stamps the database with this application's identity.
///
/// Sets `application_id` to `924716026` (the value `connect_pragmas` checks
/// for) and `user_version` to `2`.
///
/// # Errors
/// Returns an error if executing the pragma batch fails.
async fn create_pragmas(db: &Connection) -> Result<()> {
|
||||
db.call(|db| {
|
||||
db.execute_batch(
|
||||
"
|
||||
pragma application_id = 924716026;
|
||||
pragma user_version = 2; -- todo: on upgrade clear db if version is unexpected
|
||||
",
|
||||
)
|
||||
})
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
struct SqliteCache {
|
||||
db: Connection,
|
||||
}
|
||||
impl SqliteCache {
|
||||
async fn new(path: &Path) -> Result<SqliteCache> {
|
||||
let db = Connection::open(path.join("cache.sqlite3")).await?;
|
||||
connect_pragmas(&db).await?;
|
||||
|
||||
Ok(SqliteCache { db })
|
||||
}
|
||||
}
|
||||
impl std::error::Error for RkvErrWrap {}
|
||||
|
||||
impl PreprocCache for LmdbCache {
|
||||
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
||||
let start = Instant::now();
|
||||
let db_env = self
|
||||
.db_arc
|
||||
.read()
|
||||
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
|
||||
let db = db_env
|
||||
.open_single(db_name, rkv::store::Options::create())
|
||||
.map_err(RkvErrWrap)
|
||||
.context("could not open cache db store")?;
|
||||
|
||||
let reader = db_env.read().expect("could not get reader");
|
||||
let cached = db
|
||||
.get(&reader, key)
|
||||
.map_err(RkvErrWrap)
|
||||
.context("could not read from db")?;
|
||||
|
||||
match cached {
|
||||
Some(rkv::Value::Blob(cached)) => {
|
||||
debug!(
|
||||
"cache HIT, reading {} (compressed) from cache",
|
||||
print_bytes(cached.len() as f64)
|
||||
);
|
||||
debug!("reading from cache took {}", print_dur(start));
|
||||
Ok(Some(Vec::from(cached)))
|
||||
}
|
||||
Some(_) => Err(format_err!("Integrity: value not blob"))?,
|
||||
None => Ok(None),
|
||||
}
|
||||
#[async_trait::async_trait]
|
||||
impl PreprocCache for SqliteCache {
|
||||
async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
|
||||
let key = (*key).clone(); // todo: without cloning
|
||||
Ok(self
|
||||
.db
|
||||
.call(move |db| {
|
||||
db.query_row(
|
||||
"select text_content_zstd from preproc_cache where
|
||||
adapter = :adapter
|
||||
and adapter_version = :adapter_version
|
||||
and active_adapters = :active_adapters
|
||||
and file_path = :file_path
|
||||
and file_mtime_unix_ms = :file_mtime_unix_ms
|
||||
",
|
||||
named_params! {
|
||||
":adapter": &key.adapter,
|
||||
":adapter_version": &key.adapter_version,
|
||||
":active_adapters": &key.active_adapters,
|
||||
":file_path": &key.file_path,
|
||||
":file_mtime_unix_ms": &key.file_mtime_unix_ms
|
||||
},
|
||||
|r| r.get::<_, Vec<u8>>(0),
|
||||
)
|
||||
.optional()
|
||||
})
|
||||
.await
|
||||
.context("reading from cache")?)
|
||||
}
|
||||
fn set(&mut self, db_name: &str, key: &[u8], got: &[u8]) -> Result<()> {
|
||||
let start = Instant::now();
|
||||
debug!("writing {} to cache", print_bytes(got.len() as f64));
|
||||
let db_env = self
|
||||
.db_arc
|
||||
.read()
|
||||
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
|
||||
|
||||
let db = db_env
|
||||
.open_single(db_name, rkv::store::Options::create())
|
||||
.map_err(RkvErrWrap)
|
||||
.context("could not open cache db store")?;
|
||||
async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
|
||||
let key = (*key).clone(); // todo: without cloning
|
||||
Ok(self
|
||||
.db
|
||||
.call(move |db| {
|
||||
db.execute(
|
||||
"insert into preproc_cache (adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
|
||||
(:adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
|
||||
on conflict (adapter, adapter_version, active_adapters, file_path) do update set
|
||||
file_mtime_unix_ms = :file_mtime_unix_ms,
|
||||
created_unix_ms = unixepoch() * 1000,
|
||||
text_content_zstd = :text_content_zstd",
|
||||
named_params! {
|
||||
":adapter": &key.adapter,
|
||||
":adapter_version": &key.adapter_version,
|
||||
":active_adapters": &key.active_adapters,
|
||||
":file_path": &key.file_path,
|
||||
":file_mtime_unix_ms": &key.file_mtime_unix_ms,
|
||||
":text_content_zstd": value
|
||||
}
|
||||
).map(|_| ())
|
||||
})
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
/// opens a default cache
|
||||
pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
|
||||
std::fs::create_dir_all(path)?;
|
||||
SqliteCache::new(path).await
|
||||
}
|
||||
|
||||
let mut writer = db_env
|
||||
.write()
|
||||
.map_err(RkvErrWrap)
|
||||
.with_context(|| format_err!("could not open write handle to cache"))?;
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
db.put(&mut writer, key, &rkv::Value::Blob(got))
|
||||
.map_err(RkvErrWrap)
|
||||
.with_context(|| format_err!("could not write to cache"))?;
|
||||
writer
|
||||
.commit()
|
||||
.map_err(RkvErrWrap)
|
||||
.with_context(|| "could not write cache".to_string())?;
|
||||
debug!("writing to cache took {}", print_dur(start));
|
||||
use crate::preproc_cache::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_write() -> anyhow::Result<()> {
|
||||
let path = tempfile::tempdir()?;
|
||||
let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
|
||||
// db.set();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user