Recognize files starting with 'From ' as mbox

It seems that tree_magic does not recognize this format.
This commit is contained in:
FliegendeWurst 2023-07-31 17:23:08 +02:00
parent 780aa3b134
commit f72e6733d3
3 changed files with 53 additions and 21 deletions

View File

@ -4,12 +4,10 @@ use anyhow::Result;
use async_stream::stream; use async_stream::stream;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use mime2ext::mime2ext; use mime2ext::mime2ext;
use regex::bytes::Regex;
use tokio::io::AsyncReadExt; use tokio::io::AsyncReadExt;
use std::{ use std::{collections::VecDeque, io::Cursor};
collections::VecDeque,
io::Cursor,
};
static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"];
static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"];
@ -17,8 +15,9 @@ lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta { static ref METADATA: AdapterMeta = AdapterMeta {
name: "mail".to_owned(), name: "mail".to_owned(),
version: 1, version: 1,
description: "Reads mailbox/mail files and runs extractors on the contents and attachments." description:
.to_owned(), "Reads mailbox/mail files and runs extractors on the contents and attachments."
.to_owned(),
recurses: true, recurses: true,
fast_matchers: EXTENSIONS fast_matchers: EXTENSIONS
.iter() .iter()
@ -33,6 +32,7 @@ lazy_static! {
disabled_by_default: true, disabled_by_default: true,
keep_fast_matchers_if_accurate: true keep_fast_matchers_if_accurate: true
}; };
static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap();
} }
#[derive(Default)] #[derive(Default)]
pub struct MboxAdapter; pub struct MboxAdapter;
@ -65,16 +65,18 @@ impl FileAdapter for MboxAdapter {
.. ..
} = ai; } = ai;
let mut content = String::new(); let mut content = Vec::new();
let s = stream! { let s = stream! {
inp.read_to_string(&mut content).await?; inp.read_to_end(&mut content).await?;
let mut ais = vec![]; let mut ais = vec![];
for mail in content.split("\nFrom ") { for mail_bytes in FROM_REGEX.splitn(&content, usize::MAX) {
let mail_bytes = mail.as_bytes(); // &content[offset..offset2];
let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap(); let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap();
let mail = mailparse::parse_mail(mail_content)?; let mail = mailparse::parse_mail(mail_content);
if mail.is_err() {
continue;
}
let mail = mail.unwrap();
let mut todos = VecDeque::new(); let mut todos = VecDeque::new();
todos.push_back(mail); todos.push_back(mail);
@ -101,11 +103,15 @@ impl FileAdapter for MboxAdapter {
let mut config = config.clone(); let mut config = config.clone();
config.accurate = true; config.accurate = true;
let raw_body = mail.get_body_raw();
if raw_body.is_err() {
continue;
}
let ai2: AdaptInfo = AdaptInfo { let ai2: AdaptInfo = AdaptInfo {
filepath_hint: path, filepath_hint: path,
is_real_file: false, is_real_file: false,
archive_recursion_depth: archive_recursion_depth + 1, archive_recursion_depth: archive_recursion_depth + 1,
inp: Box::pin(Cursor::new(mail.get_body_raw()?)), inp: Box::pin(Cursor::new(raw_body.unwrap())),
line_prefix: line_prefix.to_string(), line_prefix: line_prefix.to_string(),
config: config, config: config,
postprocess, postprocess,
@ -143,10 +149,18 @@ mod tests {
let mut file = file?; let mut file = file?;
let mut buf = Vec::new(); let mut buf = Vec::new();
file.inp.read_to_end(&mut buf).await?; file.inp.read_to_end(&mut buf).await?;
match file.filepath_hint.components().last().unwrap().as_os_str().to_str().unwrap() { match file
.filepath_hint
.components()
.last()
.unwrap()
.as_os_str()
.to_str()
.unwrap()
{
"data.txt" | "data.html" => { "data.txt" | "data.html" => {
assert!(String::from_utf8(buf)?.contains("Thank you for your contribution")); assert!(String::from_utf8(buf)?.contains("Thank you for your contribution"));
}, }
x => panic!("unexpected filename {x:?}"), x => panic!("unexpected filename {x:?}"),
} }
count += 1; count += 1;
@ -181,6 +195,8 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn mbox_attachment() -> Result<()> { async fn mbox_attachment() -> Result<()> {
init_logging();
let adapter = MboxAdapter; let adapter = MboxAdapter;
let filepath = test_data_dir().join("mail_with_attachment.mbox"); let filepath = test_data_dir().join("mail_with_attachment.mbox");
@ -202,10 +218,13 @@ mod tests {
file.inp.read_to_end(&mut buf).await?; file.inp.read_to_end(&mut buf).await?;
match path { match path {
"data.html.txt" => { "data.html.txt" => {
assert_eq!("PREFIX:regular text\nPREFIX:\n", String::from_utf8(buf)?); assert_eq!(
"PREFIX:regular text\nPREFIX:\n",
String::from_utf8(buf).unwrap_or("err".to_owned())
);
} }
"short.pdf.txt" => { "short.pdf.txt" => {
assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(buf)?); assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(buf).unwrap_or("err".to_owned()));
} }
_ => { _ => {
panic!("unrelated {path:?}"); panic!("unrelated {path:?}");

View File

@ -41,9 +41,13 @@ async fn choose_adapter(
let mimetype = if config.accurate { let mimetype = if config.accurate {
let buf = inp.fill_buf().await?; // fill but do not consume! let buf = inp.fill_buf().await?; // fill but do not consume!
let mimetype = tree_magic::from_u8(buf); if buf.starts_with(b"From \x0d") || buf.starts_with(b"From -") {
debug!("mimetype: {:?}", mimetype); Some("application/mbox")
Some(mimetype) } else {
let mimetype = tree_magic::from_u8(buf);
debug!("mimetype: {:?}", mimetype);
Some(mimetype)
}
} else { } else {
None None
}; };

View File

@ -46,7 +46,11 @@ pub fn simple_adapt_info_full(
postprocess: true, postprocess: true,
}, },
FastFileMatcher::FileExtension( FastFileMatcher::FileExtension(
filepath.extension().unwrap().to_string_lossy().into_owned(), filepath
.extension()
.unwrap_or_default()
.to_string_lossy()
.into_owned(),
) )
.into(), .into(),
) )
@ -68,3 +72,8 @@ pub fn poppler_adapter() -> CustomSpawningFileAdapter {
adapter.to_adapter() adapter.to_adapter()
} }
/// Sets up `env_logger` in test mode so log output is available while running tests.
///
/// The result of `try_init` is deliberately discarded: this helper may be
/// invoked by more than one test, and a failed initialization attempt must
/// not abort the calling test.
#[cfg(test)]
pub fn init_logging() {
    let mut logger_builder = env_logger::builder();
    logger_builder.is_test(true);
    // Ignore the outcome on purpose; see the function documentation.
    let _ = logger_builder.try_init();
}