mirror of
https://github.com/FliegendeWurst/ripgrep-all.git
synced 2024-11-08 14:00:37 +00:00
Recognize files starting with 'From ' as mbox
Seems tree_magic does not know about this format
This commit is contained in:
parent
780aa3b134
commit
f72e6733d3
@ -4,12 +4,10 @@ use anyhow::Result;
|
|||||||
use async_stream::stream;
|
use async_stream::stream;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use mime2ext::mime2ext;
|
use mime2ext::mime2ext;
|
||||||
|
use regex::bytes::Regex;
|
||||||
use tokio::io::AsyncReadExt;
|
use tokio::io::AsyncReadExt;
|
||||||
|
|
||||||
use std::{
|
use std::{collections::VecDeque, io::Cursor};
|
||||||
collections::VecDeque,
|
|
||||||
io::Cursor,
|
|
||||||
};
|
|
||||||
|
|
||||||
static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"];
|
static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"];
|
||||||
static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"];
|
static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"];
|
||||||
@ -17,8 +15,9 @@ lazy_static! {
|
|||||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||||
name: "mail".to_owned(),
|
name: "mail".to_owned(),
|
||||||
version: 1,
|
version: 1,
|
||||||
description: "Reads mailbox/mail files and runs extractors on the contents and attachments."
|
description:
|
||||||
.to_owned(),
|
"Reads mailbox/mail files and runs extractors on the contents and attachments."
|
||||||
|
.to_owned(),
|
||||||
recurses: true,
|
recurses: true,
|
||||||
fast_matchers: EXTENSIONS
|
fast_matchers: EXTENSIONS
|
||||||
.iter()
|
.iter()
|
||||||
@ -33,6 +32,7 @@ lazy_static! {
|
|||||||
disabled_by_default: true,
|
disabled_by_default: true,
|
||||||
keep_fast_matchers_if_accurate: true
|
keep_fast_matchers_if_accurate: true
|
||||||
};
|
};
|
||||||
|
static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap();
|
||||||
}
|
}
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MboxAdapter;
|
pub struct MboxAdapter;
|
||||||
@ -65,16 +65,18 @@ impl FileAdapter for MboxAdapter {
|
|||||||
..
|
..
|
||||||
} = ai;
|
} = ai;
|
||||||
|
|
||||||
let mut content = String::new();
|
let mut content = Vec::new();
|
||||||
let s = stream! {
|
let s = stream! {
|
||||||
inp.read_to_string(&mut content).await?;
|
inp.read_to_end(&mut content).await?;
|
||||||
|
|
||||||
let mut ais = vec![];
|
let mut ais = vec![];
|
||||||
for mail in content.split("\nFrom ") {
|
for mail_bytes in FROM_REGEX.splitn(&content, usize::MAX) {
|
||||||
|
|
||||||
let mail_bytes = mail.as_bytes(); // &content[offset..offset2];
|
|
||||||
let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap();
|
let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap();
|
||||||
let mail = mailparse::parse_mail(mail_content)?;
|
let mail = mailparse::parse_mail(mail_content);
|
||||||
|
if mail.is_err() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let mail = mail.unwrap();
|
||||||
|
|
||||||
let mut todos = VecDeque::new();
|
let mut todos = VecDeque::new();
|
||||||
todos.push_back(mail);
|
todos.push_back(mail);
|
||||||
@ -101,11 +103,15 @@ impl FileAdapter for MboxAdapter {
|
|||||||
let mut config = config.clone();
|
let mut config = config.clone();
|
||||||
config.accurate = true;
|
config.accurate = true;
|
||||||
|
|
||||||
|
let raw_body = mail.get_body_raw();
|
||||||
|
if raw_body.is_err() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
let ai2: AdaptInfo = AdaptInfo {
|
let ai2: AdaptInfo = AdaptInfo {
|
||||||
filepath_hint: path,
|
filepath_hint: path,
|
||||||
is_real_file: false,
|
is_real_file: false,
|
||||||
archive_recursion_depth: archive_recursion_depth + 1,
|
archive_recursion_depth: archive_recursion_depth + 1,
|
||||||
inp: Box::pin(Cursor::new(mail.get_body_raw()?)),
|
inp: Box::pin(Cursor::new(raw_body.unwrap())),
|
||||||
line_prefix: line_prefix.to_string(),
|
line_prefix: line_prefix.to_string(),
|
||||||
config: config,
|
config: config,
|
||||||
postprocess,
|
postprocess,
|
||||||
@ -143,10 +149,18 @@ mod tests {
|
|||||||
let mut file = file?;
|
let mut file = file?;
|
||||||
let mut buf = Vec::new();
|
let mut buf = Vec::new();
|
||||||
file.inp.read_to_end(&mut buf).await?;
|
file.inp.read_to_end(&mut buf).await?;
|
||||||
match file.filepath_hint.components().last().unwrap().as_os_str().to_str().unwrap() {
|
match file
|
||||||
|
.filepath_hint
|
||||||
|
.components()
|
||||||
|
.last()
|
||||||
|
.unwrap()
|
||||||
|
.as_os_str()
|
||||||
|
.to_str()
|
||||||
|
.unwrap()
|
||||||
|
{
|
||||||
"data.txt" | "data.html" => {
|
"data.txt" | "data.html" => {
|
||||||
assert!(String::from_utf8(buf)?.contains("Thank you for your contribution"));
|
assert!(String::from_utf8(buf)?.contains("Thank you for your contribution"));
|
||||||
},
|
}
|
||||||
x => panic!("unexpected filename {x:?}"),
|
x => panic!("unexpected filename {x:?}"),
|
||||||
}
|
}
|
||||||
count += 1;
|
count += 1;
|
||||||
@ -181,6 +195,8 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn mbox_attachment() -> Result<()> {
|
async fn mbox_attachment() -> Result<()> {
|
||||||
|
init_logging();
|
||||||
|
|
||||||
let adapter = MboxAdapter;
|
let adapter = MboxAdapter;
|
||||||
|
|
||||||
let filepath = test_data_dir().join("mail_with_attachment.mbox");
|
let filepath = test_data_dir().join("mail_with_attachment.mbox");
|
||||||
@ -202,10 +218,13 @@ mod tests {
|
|||||||
file.inp.read_to_end(&mut buf).await?;
|
file.inp.read_to_end(&mut buf).await?;
|
||||||
match path {
|
match path {
|
||||||
"data.html.txt" => {
|
"data.html.txt" => {
|
||||||
assert_eq!("PREFIX:regular text\nPREFIX:\n", String::from_utf8(buf)?);
|
assert_eq!(
|
||||||
|
"PREFIX:regular text\nPREFIX:\n",
|
||||||
|
String::from_utf8(buf).unwrap_or("err".to_owned())
|
||||||
|
);
|
||||||
}
|
}
|
||||||
"short.pdf.txt" => {
|
"short.pdf.txt" => {
|
||||||
assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(buf)?);
|
assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(buf).unwrap_or("err".to_owned()));
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
panic!("unrelated {path:?}");
|
panic!("unrelated {path:?}");
|
||||||
|
@ -41,9 +41,13 @@ async fn choose_adapter(
|
|||||||
|
|
||||||
let mimetype = if config.accurate {
|
let mimetype = if config.accurate {
|
||||||
let buf = inp.fill_buf().await?; // fill but do not consume!
|
let buf = inp.fill_buf().await?; // fill but do not consume!
|
||||||
let mimetype = tree_magic::from_u8(buf);
|
if buf.starts_with(b"From \x0d") || buf.starts_with(b"From -") {
|
||||||
debug!("mimetype: {:?}", mimetype);
|
Some("application/mbox")
|
||||||
Some(mimetype)
|
} else {
|
||||||
|
let mimetype = tree_magic::from_u8(buf);
|
||||||
|
debug!("mimetype: {:?}", mimetype);
|
||||||
|
Some(mimetype)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
@ -46,7 +46,11 @@ pub fn simple_adapt_info_full(
|
|||||||
postprocess: true,
|
postprocess: true,
|
||||||
},
|
},
|
||||||
FastFileMatcher::FileExtension(
|
FastFileMatcher::FileExtension(
|
||||||
filepath.extension().unwrap().to_string_lossy().into_owned(),
|
filepath
|
||||||
|
.extension()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.to_string_lossy()
|
||||||
|
.into_owned(),
|
||||||
)
|
)
|
||||||
.into(),
|
.into(),
|
||||||
)
|
)
|
||||||
@ -68,3 +72,8 @@ pub fn poppler_adapter() -> CustomSpawningFileAdapter {
|
|||||||
|
|
||||||
adapter.to_adapter()
|
adapter.to_adapter()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
pub fn init_logging() {
|
||||||
|
let _ = env_logger::builder().is_test(true).try_init();
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user