Initial work on mbox extractor

This commit is contained in:
FliegendeWurst 2023-07-31 14:30:49 +02:00
parent 5fa777605d
commit 9642552fa3
8 changed files with 8167 additions and 15 deletions

142
Cargo.lock generated
View File

@ -25,7 +25,7 @@ version = "0.7.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
dependencies = [
"memchr",
"memchr 2.5.0",
]
[[package]]
@ -85,7 +85,7 @@ dependencies = [
"flate2",
"futures-core",
"futures-io",
"memchr",
"memchr 2.5.0",
"pin-project-lite 0.2.9",
"tokio 0.2.25",
"tokio 0.3.7",
@ -179,6 +179,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "base64"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "bincode"
version = "1.3.3"
@ -188,6 +194,12 @@ dependencies = [
"serde",
]
[[package]]
name = "bitflags"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4efd02e230a02e18f92fc2735f44597385ed02ad8f831e7c1c1156ee5e1ab3a5"
[[package]]
name = "bitflags"
version = "1.3.2"
@ -266,6 +278,12 @@ dependencies = [
"pkg-config",
]
[[package]]
name = "casing"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8069a314fdf61ab368081307e6d351431b2a1a04822eba87834230a238cb93c4"
[[package]]
name = "cc"
version = "1.0.79"
@ -281,6 +299,16 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "charset"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18e9079d1a12a2cc2bffb5db039c43661836ead4082120d5844f02555aca2d46"
dependencies = [
"base64",
"encoding_rs",
]
[[package]]
name = "chrono"
version = "0.4.23"
@ -304,7 +332,7 @@ checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"bitflags 1.3.2",
"strsim 0.8.0",
"textwrap",
"unicode-width",
@ -317,7 +345,7 @@ version = "4.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0b0588d44d4d63a87dbd75c136c166bbfd9a86a31cb89e09906521c7d3f5e3"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"clap_lex",
"is-terminal",
"strsim 0.10.0",
@ -486,6 +514,12 @@ dependencies = [
"syn",
]
[[package]]
name = "data-encoding"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
[[package]]
name = "derive_more"
version = "0.99.17"
@ -966,7 +1000,7 @@ version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "447a296f7aca299cfbb50f4e4f3d49451549af655fb7215d7f8c0c3d64bad42b"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"byteorder",
"libc",
"lmdb-rkv-sys",
@ -1013,6 +1047,41 @@ dependencies = [
"pkg-config",
]
[[package]]
name = "mailbox"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "602ce3fa1224e605e1c9949ff090baf17e3c84e8d87123ee31d838a2ed5ed9f5"
dependencies = [
"bitflags 0.9.1",
"casing",
"chrono",
"fnv",
"mime",
"nom 3.2.1",
"owning_ref",
]
[[package]]
name = "mailparse"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b56570f5f8c0047260d1c8b5b331f62eb9c660b9dd4071a8c46f8c7d3f280aa"
dependencies = [
"charset",
"data-encoding",
"quoted_printable",
]
[[package]]
name = "memchr"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
dependencies = [
"libc",
]
[[package]]
name = "memchr"
version = "2.5.0"
@ -1028,6 +1097,18 @@ dependencies = [
"autocfg",
]
[[package]]
name = "mime"
version = "0.3.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "mime2ext"
version = "0.1.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1a85a5069ebd40e64b1985773cc81addbe9d90d7ecf60e7b5475a57ad584c70"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@ -1055,13 +1136,22 @@ dependencies = [
"windows-sys 0.45.0",
]
[[package]]
name = "nom"
version = "3.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05aec50c70fd288702bcd93284a8444607f3292dbdf2a30de5ea5dcdbe72287b"
dependencies = [
"memchr 1.0.2",
]
[[package]]
name = "nom"
version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
dependencies = [
"memchr",
"memchr 2.5.0",
"minimal-lexical",
]
@ -1169,6 +1259,15 @@ dependencies = [
"winapi",
]
[[package]]
name = "owning_ref"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdf84f41639e037b484f93433aa3897863b561ed65c6e59c7073d7c561710f37"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -1308,13 +1407,19 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "quoted_printable"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a24039f627d8285853cc90dcddf8c1ebfaa91f834566948872b225b9a28ed1b6"
[[package]]
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
"bitflags 1.3.2",
]
[[package]]
@ -1335,7 +1440,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
dependencies = [
"aho-corasick",
"memchr",
"memchr 2.5.0",
"regex-syntax",
]
@ -1381,7 +1486,10 @@ dependencies = [
"json_comments",
"lazy_static",
"log",
"memchr",
"mailbox",
"mailparse",
"memchr 2.5.0",
"mime2ext",
"paste",
"path-clean",
"pretty-bytes",
@ -1411,7 +1519,7 @@ checksum = "a6006704273063c72952370ad236b8d58556dcc4f99a95ced4d9ad40f3e80a69"
dependencies = [
"arrayref",
"bincode",
"bitflags",
"bitflags 1.3.2",
"byteorder",
"id-arena",
"lazy_static",
@ -1432,7 +1540,7 @@ version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
@ -1455,7 +1563,7 @@ version = "0.36.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
@ -1598,6 +1706,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "strsim"
version = "0.8.0"
@ -1762,7 +1876,7 @@ dependencies = [
"autocfg",
"bytes 1.4.0",
"libc",
"memchr",
"memchr 2.5.0",
"mio",
"num_cpus",
"parking_lot",
@ -1870,7 +1984,7 @@ dependencies = [
"bytecount",
"fnv",
"lazy_static",
"nom",
"nom 7.1.3",
"once_cell",
"petgraph",
]

View File

@ -37,7 +37,10 @@ glob = "0.3.0"
json_comments = "0.2.1"
lazy_static = "1.4.0"
log = "0.4.11"
mailbox = "0.2.0"
mailparse = "0.14.0"
memchr = "2.3.3"
mime2ext = "0.1.52"
paste = "1.0.0"
path-clean = "0.1.0"
pretty-bytes = "0.2.2"

217
exampledir/mail_nested.eml Normal file
View File

@ -0,0 +1,217 @@
To: submit.t4eseGWSvG1JST3r@spam.spamcop.net
From: 2012gdwu <2012gdwu@posteo.de>
Subject: Postbank Spam
Autocrypt: addr=2012gdwu@posteo.de; keydata=
mDMEXXjwiRYJKwYBBAHaRw8BAQdAmjXRazNXXy5tK05Dwl5mSRbdth9JkQq92V/QVyqjdgm0
I0FybmUgS2VsbGVyIDxhcm5lLmtlbGxlckBwb3N0ZW8uZGU+iJYEExYIAD4WIQR2UN3HoAGx
KI0B7Eih+UCxBQvPLgUCXXjwiQIbAwUJCWYBgAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK
CRCh+UCxBQvPLpPfAP4gs6Oky3+UO2LU2XxweeQO+YEWXK0QtM2+ajzrGaF3HAD+LBfmyB9+
Wom2KP0CwxUzI4d6zmiAMSKOnGGgzd65igm4OARdePCJEgorBgEEAZdVAQUBAQdAncxZ3Rox
wmvm+/qCkCm9+PU2HmWr08M3qdqkf2L4IngDAQgHiH4EGBYIACYWIQR2UN3HoAGxKI0B7Eih
+UCxBQvPLgUCXXjwiQIbDAUJCWYBgAAKCRCh+UCxBQvPLpQkAQCgYOlOftMNi+sfn+XQvfOc
ULQWp+cgOBMcyVCdpJEQCwD9HBuwuHobl8FPm0PbRtlCn/7GY4WK+Hh4+3BKmhRn8wU=
Message-ID: <1530ae05-33a7-fa40-9473-ca625a14385a@posteo.de>
Date: Mon, 20 Jul 2020 07:35:55 +0200
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101
Thunderbird/68.10.0
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="------------6670F92201FB126ED9472803"
Content-Language: de-DE
This is a multi-part message in MIME format.
--------------6670F92201FB126ED9472803
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
here you go
--------------6670F92201FB126ED9472803
Content-Type: message/rfc822;
name="postbank.eml"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
filename="postbank.eml"
Return-Path: <gxnwgddl@carcarry.de>
Delivered-To: arne.keller@posteo.de
Received: from proxy02.posteo.name ([127.0.0.1])
by dovecot12 (Dovecot) with LMTP id EaKBGxv9FF+9mwEAJesNpQ
for <arne.keller@posteo.de>; Mon, 20 Jul 2020 04:15:27 +0200
Received: from proxy02.posteo.de ([127.0.0.1])
by proxy02.posteo.name (Dovecot) with LMTP id 31UFGtHsFF+T4gMAGFAyLg
; Mon, 20 Jul 2020 04:15:27 +0200
Received: from mailin05.posteo.de (unknown [10.0.1.5])
by proxy02.posteo.de (Postfix) with ESMTPS id 4B950v2JYGz11fk
for <arne.keller@posteo.de>; Mon, 20 Jul 2020 04:15:27 +0200 (CEST)
Received: from mx03.posteo.de (mailin05.posteo.de [127.0.0.1])
by mailin05.posteo.de (Postfix) with ESMTPS id 4270120F15
for <arne.keller@posteo.de>; Mon, 20 Jul 2020 04:15:27 +0200 (CEST)
X-Virus-Scanned: amavisd-new at posteo.de
X-Spam-Flag: NO
X-Spam-Score: 2.639
X-Spam-Level: **
X-Spam-Status: No, score=2.639 tagged_above=-1000 required=8
tests=[AV:Heuristics.Phishing.Email.SpoofedDomain=0.1, ALL_TRUSTED=-1,
FROM_LOCAL_NOVOWEL=0.5, HK_RANDOM_ENVFROM=0.626, HK_RANDOM_FROM=0.999,
HTML_FONT_LOW_CONTRAST=0.001, HTML_IMAGE_ONLY_24=1.282,
HTML_MESSAGE=0.001, HTTPS_HTTP_MISMATCH=0.1, POSTEO_GENERICS_IO=0.01,
T_FILL_THIS_FORM_SHORT=0.01, T_REMOTE_IMAGE=0.01] autolearn=disabled
Received: from mout.web.de (mout.web.de [212.227.15.14])
by mx03.posteo.de (Postfix) with ESMTPS id 4B950t696Mz10nB
for <arne.keller@posteo.de>; Mon, 20 Jul 2020 04:15:26 +0200 (CEST)
Authentication-Results: mx03.posteo.de; dmarc=none (p=none dis=none) header.from=carcarry.de
Received: from [212.227.15.17] ([212.227.15.17]) by mx-ha.web.de (mxweb010
[212.227.15.17]) with ESMTPS (Nemesis) id 1MRloE-1kQNT22I4w-00T9hm for
<arne.keller@posteo.de>; Mon, 20 Jul 2020 04:15:26 +0200
Received: from mout.kundenserver.de ([212.227.17.24]) by mx-ha.web.de
(mxweb010 [212.227.15.17]) with ESMTPS (Nemesis) id 1MINbE-1k0aRm2Hzw-00EOVM
for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26 +0200
Received: from 217.160.251.109 ([217.160.251.109]) by mrelayeu.kundenserver.de
(mreue107 [212.227.15.183]) with ESMTPSA (Nemesis) id
1MPoPd-1kBHRt0o2F-00MqkS for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26
+0200
From: "=?utf-8?B?UE9TVEJBTs2fS82f?=" <gxnwgddl@carcarry.de>
Subject: BsetSign App : Y7P32-HTXU2-FRDG7
To: "2012gdwu" <2012gdwu@web.de>
Content-Type: multipart/alternative; boundary="QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c"
MIME-Version: 1.0
Date: Mon, 20 Jul 2020 02:15:26 +0000
Message-ID: <1M3lHZ-1jyAPt0pTn-000u1I@mrelayeu.kundenserver.de>
X-Provags-ID: V03:K1:68TECBVA88ZKh8HcSl/N+ElwlecL1tc+1AuDDyqm9em66WO295R
IfuHqA9uG7+Vlyr99v+OneGltnr43KfsgRKj9GgOpDj2QelHphKFGPILAvvsQ8vOq6ucC2W
BW3NEOh3JhitB6o4xLEmj+dbivC0ie728/cPMcjj6TwyBzw5nT1or8mBZWoEMSF/zcu+PIr
gGpFY2puzzURN4oKX82/w==
X-Spam-Flag: NO
X-UI-Out-Filterresults: notjunk:1;V03:K0:c01ZANnvlk8=:ouSMGue72FUx2PJOSNnmEW
qI8A89gf6q3aAdJBhLX1Bhd70xio64ljpha9X5ArOYg6Q2RH1JYyvfBSMoTo3HMy37H3L8kaq
ReRCdSPOMD8+llZ/rRpPLl+7PofGOv+Hu3UO7gzgm9v0YqwLZIwh9P2w9TIu+GqVJWeDdmxrs
RDPeHY8lsRL+8AFeSGNiWBYMEHDxKofTqS5Zh7mal1Bm4JbgEEIP36V4oL3c6V1olMHQZzEH9
7D0T8U6LyLyfSbuu5M6QN2FZ+F6IDJNDUG1uwNt9K12ESY6TweMR3xInFabiZ9fMPmrjPaNwW
hlyKg67tDYL2lfk2fpa/LbhLnlfKEDqSvkgK54CZh+xbIQetju66cZUEFQyCIcGdAOWI8+nty
FdbNUzxhNpZTPBrA7H95gRuc0u2GJBfZZsxdp46jpBwG65yqmJ32pkJrATo8CNbBO9A6hpdyL
UNu5bavZBJp9dsyY6Cnm6vMOIjJ8qMy/vNkrtRXNWBrnVHhuQZ3B+osG8XWLiyq7s4hFOwDxY
WLRgjKL6HgIj+2DLParwiuSsX8TVy5+WhxDUou0UJDzD3C1JmYiryTlo4Vu4CIZFXkgAuAsEq
c55M6L2eUmD3xQNaqgMEJFksT2qXWaSb2Qw6HM7mtLBbSUhuWtSv2oeVrNwgx8XWexWYYZYFv
KAZzICpkVhxpYIntoKRiDtQZxBDejPwGmne2iG81rn34pGJwOOYojf9dFghodE5bZEqVh6KbA
f/38x9FIoYewzA2WuyngX/bXTdkLQM49W1vdlF5DQOlgYuM8Ni7NeJG888VhDZxcUn6vIIJs3
xH0jOWrWCUz0gK9uyyagjcfdXr54Zv1E7i936CTlRq5QnDKN2C9jQFH5ymD4G1W5zX6Xj/05O
M7VaU9Y3mvOM/+82zsKc5zJOFOf9MoI5JBhnPjHWeqaJgpYhNoKgGvPo3QfZFwzk/MHH2PgB1
PLGvjSE8u/cpYeGhJdzTXM00J9ai5yGRNFD71zHoHBOFGCpmZVnJJ8SD+qUd4K4BfSD+DJ5Qd
t1wsCpH5bgodnXgMcN6Zj0q3P/ODk3dnah1hsYMyIWDBFZ0cTlp2QkYhAKZh1HM5WcfSc5UwU
SrcK9HHiG7BKOFYA1r6Rx5YYqwGWeGxr9mlH7MLyfCwI8PlWtfeB7Pj4eEI1hLy9GMnHBCJDj
W8o1yDeE54rgWHR7CtIF6w+qF+quA3ZdwVSPOHwQeH7vS4OaJjeEyeeT4YOJdIMI7UknEasAG
LfMS/PKWx7+YcUNaz0xvO70NwZj1FKJuWqDS6ZTciMSvGkEFTWVOqn5nPlHi8hDbBTVn70aPa
BQi3U68hgdDpJIHlVLLvRcaCYYly3L60NQBgJroag4fRiIvDUSXfDatrDYOv+L4xBYdB3GP+s
wqtsPY82YOwXP5KlRMPVEZcuWX5tWiOuaNjePbEkXpE2iQZUqfkDQTYNUGZR+TTBqHOWjO7R3
hORQB0gOwe85gZv80G1EL32EtRjVxJxQfrHGPCGXb8HRXbvGGV3Xu3wZEE8iuJngBUJtWeDBq
q61rYwZxVuml72lfRM6Lo+OGLAsyqvobxujY9BHpokZH4FNlUstjUoPANTGoAhM+MyQb0fSAV
8HA/r6n0oJh0B8+2AxJvVokbhEbL/RlJIZIYpCeRceeA+jjBaR7EvuglUoLN3CcB9CrdDH/qz
ymHzEjPVnFar3/sqRjeKyIk71z4yotOKCPQcdD1gTbYWehZiIJwAlDFSpfPdFTQLOJMWd3wuD
0mHLep6tLtCY+hjhCYWlTyKKQ8CWiBWPTql21bPp7XVWCfc+4u8kZi5Y3dg3pvpSwwmcyRisX
+7+8a+pBzN4VOEuX+dzglKDrNd6h2OL0tBMnk1yqAV27dX9cMRrO941IvtiaZO90BjZtV92oP
XkGxvKnGQuynHus/3yblaw==
This is a multi-part message in MIME format
--QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: inline
Sehr geehrter Herr / Frau =E2=80=A6,
Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign-Anwendung=
en.
=C3=96ffnen Sie den unten stehenden Aktivierungslink, um am Upgrade t=
eilzunehmen. Verkn=C3=BCpfung
https://meine.postbank.de/#/login
Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren.
Reundliche Gr=C3=BC=C3=9Fe,
=C2=A9 2020 Postbank=E2=80=93 eine Niederlassung der Deutsche Bank AG=
Hypnotiseur/zertifizierter Hypnosecoach (DVH)
Burnoutpr=C3=A4ventionscoach
Modeberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen
Kurs/Seminarleiter Waldbaden/Waldcoach
Am Wiesengrund 5
24980 Schafflund
Tel.: 04639-98475
Mob.: 015117317305
Home : www.hypnosepraxis-im-norden.de
Home : www.masshemden-im-norden.de
Home : www.waldbaden-zwischen-den-meeren.de
--QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c
Content-Type: text/html; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: inline
<html><head></head><body><p><img width=3D"174" height=3D"51" alt=3D"" =
src=3D"https://upload.wikimedia.org/wikipedia/commons/thumb/d/d1/Postb=
ank-Logo.svg/1200px-Postbank-Logo.svg.png"></p><p><br></p>
<div>
<div>&nbsp;Sehr geehrter Herr / Frau =E2=80=A6,</div>
<div>&nbsp;Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign=
-Anwendungen.<br><br></div>
<div>&nbsp;=C3=96ffnen Sie den unten stehenden Aktivierungslink, um am=
Upgrade teilzunehmen. Verkn=C3=BCpfung</div><div><br></div>
<div>&nbsp;<a href=3D"https://www.astcdubai.com/.well-known/.re/">http=
s://meine.postbank.de/#/login</a></div><div><br></div>
<div>&nbsp;Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren.<=
/div>
<div>&nbsp;Reundliche Gr=C3=BC=C3=9Fe,</div>
<div>&nbsp;<strong>=C2=A9</strong> 2020 <strong>Postbank</strong>=E2=80=
=93 eine Niederlassung der Deutsche Bank AG<br><br> <span style=3D"col=
or: rgb(255, 255, 255);">Hypnotiseur/zertifizierter Hypnosecoach (DVH)=
</span><br><span style=3D"color: rgb(255, 255, 255);"> Burnoutpr=C3=A4=
ventionscoach</span><br><span style=3D"color: rgb(255, 255, 255);"> Mo=
deberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen</span><br><span style=
=3D"color: rgb(255, 255, 255);"> Kurs/Seminarleiter Waldbaden/Waldcoac=
h</span><br><span style=3D"color: rgb(255, 255, 255);"> Am Wiesengrund=
5</span><br><span style=3D"color: rgb(255, 255, 255);"> 24980 Schaffl=
und</span><br><span style=3D"color: rgb(255, 255, 255);"> Tel.: 04639-=
98475</span><br><span style=3D"color: rgb(255, 255, 255);"> Mob.: 0151=
17317305</span><br><span style=3D"color: rgb(255, 255, 255);"> Home : =
<a style=3D"color: rgb(255, 255, 255);" href=3D"https://deref-gmx.net/=
mail/client/Pk7kcpLwLpI/dereferrer/?redirectUrl=3Dhttp%3A%2F%2Fwww.hyp=
nosepraxis-im-norden.de" target=3D"_blank" rel=3D"noopener">www.hypnos=
epraxis-im-norden.de</a></span><br><span style=3D"color: rgb(255, 255,=
255);"> Home : <a style=3D"color: rgb(255, 255, 255);" href=3D"https:=
//deref-gmx.net/mail/client/KR0VAuy5YPo/dereferrer/?redirectUrl=3Dhttp=
%3A%2F%2Fwww.masshemden-im-norden.de" target=3D"_blank" rel=3D"noopene=
r">www.masshemden-im-norden.de</a></span><br><span style=3D"color: rgb=
(255, 255, 255);"> Home : <a style=3D"color: rgb(255, 255, 255);" href=
=3D"https://deref-gmx.net/mail/client/QTybHixMVsI/dereferrer/?redirect=
Url=3Dhttp%3A%2F%2Fwww.waldbaden-zwischen-den-meeren.de" target=3D"_bl=
ank" rel=3D"noopener">www.waldbaden-zwischen-den-meeren.de</a></span><=
/div>
</div></body></html>
--QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c--
--------------6670F92201FB126ED9472803--

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
pub mod custom;
pub mod decompress;
pub mod ffmpeg;
pub mod mbox;
pub mod postproc;
use std::sync::Arc;
pub mod sqlite;
@ -115,6 +116,7 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
Arc::new(ffmpeg::FFmpegAdapter::new()),
Arc::new(zip::ZipAdapter::new()),
Arc::new(decompress::DecompressAdapter::new()),
Arc::new(mbox::MboxAdapter::new()),
Arc::new(tar::TarAdapter::new()),
Arc::new(sqlite::SqliteAdapter::new()),
];

View File

@ -99,7 +99,7 @@ lazy_static! {
name: "pandoc".to_string(),
description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text".to_string(),
version: 3,
extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb"]),
extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb", "html", "htm"]),
binary: "pandoc".to_string(),
mimetypes: None,
// simpler markown (with more information loss but plainer text)

184
src/adapters/mbox.rs Normal file
View File

@ -0,0 +1,184 @@
use crate::adapted_iter::one_file;
use super::*;
use anyhow::Result;
use async_stream::stream;
use lazy_static::lazy_static;
use tokio::io::{BufReader, AsyncReadExt};
use std::{path::{Path, PathBuf}, sync::Mutex, io::Cursor};
/// File extensions registered as fast matchers for this adapter.
static EXTENSIONS: &[&str] = &["mbox", "mbx"];
/// MIME types registered as slow matchers for this adapter.
static MIME_TYPES: &[&str] = &["application/mbox"];

lazy_static! {
    /// Lazily built, shared description of the mbox adapter: its name,
    /// version, matchers and default flags.
    static ref METADATA: AdapterMeta = {
        // One extension matcher per entry in EXTENSIONS.
        let by_extension = EXTENSIONS
            .iter()
            .map(|ext| FastFileMatcher::FileExtension((*ext).to_owned()))
            .collect();
        // One MIME matcher per entry in MIME_TYPES.
        let by_mime = MIME_TYPES
            .iter()
            .map(|mime| FileMatcher::MimeType((*mime).to_owned()))
            .collect();

        AdapterMeta {
            name: String::from("mbox"),
            version: 1,
            description: String::from(
                "Reads mailbox files and runs extractors on the contents and attachments.",
            ),
            recurses: true,
            fast_matchers: by_extension,
            slow_matchers: Some(by_mime),
            disabled_by_default: true,
            keep_fast_matchers_if_accurate: true,
        }
    };
}
/// Adapter for mbox mailbox files. It is a unit struct and carries no state.
#[derive(Default)]
pub struct MboxAdapter;

impl MboxAdapter {
    /// Creates a new adapter value.
    pub fn new() -> Self {
        Self
    }
}
impl GetMetadata for MboxAdapter {
    /// Returns a reference to the shared, lazily initialised `METADATA`.
    fn metadata(&self) -> &AdapterMeta {
        let meta: &AdapterMeta = &METADATA;
        meta
    }
}
/// Derives the name of the file contained in `filename` by dropping its last
/// extension.
///
/// The extensions `tgz`, `tbz` and `tbz2` are replaced by `.tar`; any other
/// extension (or none) is simply removed. The parent directory is kept.
///
/// # Panics
///
/// Panics with "no filename given?" if `filename` has no file name component.
fn get_inner_filename(filename: &Path) -> PathBuf {
    let ext = match filename.extension() {
        Some(e) => e.to_string_lossy(),
        None => Cow::Borrowed(""),
    };
    let base = filename
        .file_stem()
        .expect("no filename given?")
        .to_string_lossy();

    // Shorthand tarball extensions unpack to a plain tar archive.
    let suffix = if matches!(ext.as_ref(), "tgz" | "tbz" | "tbz2") {
        ".tar"
    } else {
        ""
    };

    let mut inner = String::with_capacity(base.len() + suffix.len());
    inner.push_str(&base);
    inner.push_str(suffix);
    filename.with_file_name(inner)
}
impl FileAdapter for MboxAdapter {
    /// Splits the input into individual mails and yields one `AdaptInfo` per
    /// mail, carrying the decoded mail body.
    ///
    /// The whole input is read into memory as UTF-8 text and split on the
    /// separator `"\nFrom "`. The first line of every chunk (the separator
    /// line) is dropped; the remainder is parsed with `mailparse`. Each body
    /// is emitted under `filepath_hint` with `mail.html` appended when the
    /// mail's MIME type is `text/html`, and `mail.txt` otherwise, so that
    /// later adapters can pick it up by extension.
    ///
    /// Chunks that contain no newline (for example an empty input) are
    /// skipped instead of panicking.
    ///
    /// # Errors
    ///
    /// The returned stream yields an error, and no mails at all, if the input
    /// cannot be read as UTF-8 or if any mail fails to parse or decode.
    fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
        let AdaptInfo {
            filepath_hint,
            mut inp,
            line_prefix,
            archive_recursion_depth,
            config,
            postprocess,
            ..
        } = ai;

        let mut content = String::new();
        let s = stream! {
            inp.read_to_string(&mut content).await?;

            // All mails are collected first so that a failure in a later mail
            // is reported before anything has been yielded.
            let mut ais = vec![];
            for mail in content.split("\nFrom ") {
                // Drop the separator line; a chunk without a newline has no
                // mail content and is skipped.
                let mail_content = match mail.as_bytes().splitn(2, |x| *x == b'\n').nth(1) {
                    Some(rest) => rest,
                    None => continue,
                };
                let mail = mailparse::parse_mail(mail_content)?;
                let mail_body = mail.get_body()?;

                // The synthetic file name decides which adapter handles the body.
                let mut path = filepath_hint.clone();
                match &*mail.ctype.mimetype {
                    "text/html" => path.push("mail.html"),
                    _ => path.push("mail.txt"),
                }

                let mut config = config.clone();
                config.accurate = true;
                let ai2: AdaptInfo = AdaptInfo {
                    filepath_hint: path,
                    is_real_file: false,
                    archive_recursion_depth: archive_recursion_depth + 1,
                    inp: Box::pin(Cursor::new(mail_body.into_bytes())),
                    line_prefix: line_prefix.to_string(),
                    config,
                    postprocess,
                };
                ais.push(ai2);
            }
            for a in ais {
                yield(Ok(a));
            }
        };
        Ok(Box::pin(s))
    }
}
#[cfg(test)]
// Unit tests for this module.
// NOTE(review): none of these tests feeds an mbox file to the adapter; the
// fixtures used below are "hello.gz" and "short.pdf.gz" — confirm whether
// they were carried over from another adapter and should be replaced.
mod tests {
use super::*;
use crate::preproc::loop_adapt;
use crate::test_utils::*;
use pretty_assertions::assert_eq;
use tokio::fs::File;
// Checks that `get_inner_filename` strips the last extension and maps
// `tgz`/`tbz`/`tbz2` to `.tar`.
#[test]
fn test_inner_filename() {
for (a, b) in &[
("hi/test.tgz", "hi/test.tar"),
("hi/hello.gz", "hi/hello"),
("a/b/initramfs", "a/b/initramfs"),
("hi/test.tbz2", "hi/test.tar"),
("hi/test.tbz", "hi/test.tar"),
("hi/test.hi.bz2", "hi/test.hi"),
("hello.tar.gz", "hello.tar"),
] {
assert_eq!(get_inner_filename(&PathBuf::from(a)), PathBuf::from(*b));
}
}
// Runs `MboxAdapter::adapt` on "hello.gz" and expects the output "hello\n".
// NOTE(review): the input is a gzip fixture, not a mailbox — presumably this
// cannot pass as written; verify and replace with an mbox fixture.
#[tokio::test]
async fn gz() -> Result<()> {
let adapter = MboxAdapter;
let filepath = test_data_dir().join("hello.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = adapter.adapt(a, &d)?;
let o = adapted_to_vec(r).await?;
assert_eq!(String::from_utf8(o)?, "hello\n");
Ok(())
}
// Runs `loop_adapt` with `MboxAdapter` on "short.pdf.gz" and expects
// prefixed PDF page text.
// NOTE(review): same concern as `gz` — the fixture is not a mailbox; confirm.
#[tokio::test]
async fn pdf_gz() -> Result<()> {
let adapter = MboxAdapter;
let filepath = test_data_dir().join("short.pdf.gz");
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
let r = loop_adapt(&adapter, d, a)?;
let o = adapted_to_vec(r).await?;
assert_eq!(
String::from_utf8(o)?,
"PREFIX:Page 1: hello world
PREFIX:Page 1: this is just a test.
PREFIX:Page 1:
PREFIX:Page 1: 1
PREFIX:Page 1:
PREFIX:Page 1:
"
);
Ok(())
}
}

86
test.mbx Normal file
View File

@ -0,0 +1,86 @@
From
Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de>
Date: Mon, 27 Feb 2023 12:05:46 +0100
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
Thunderbird/102.8.0
From: Arne Keller <2012gdwu@web.de>
Subject: From encoding test
To: arne.keller@posteo.de
Content-Language: de-DE
X-Enigmail-Draft-Status: N00200
X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0;
attachmentreminder=0; deliveryformat=0
X-Identity-Key: id2
Fcc: imap://2012gdwu@imap.web.de/Gesendet
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: 7bit
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
<p>&gt;From</p>
<p>Another word &gt;From<br>
</p>
</body>
</html>
From
Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de>
Date: Mon, 27 Feb 2023 12:06:56 +0100
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
Thunderbird/102.8.0
From: Arne Keller <2012gdwu@web.de>
Subject: From encoding test
To: arne.keller@posteo.de
Content-Language: de-DE
X-Enigmail-Draft-Status: N00200
X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0;
attachmentreminder=0; deliveryformat=1
X-Identity-Key: id2
Fcc: imap://2012gdwu@imap.web.de/Gesendet
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: 7bit
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
<p>&gt;From</p>
<p>Another word &gt;From<br>
</p>
</body>
</html>
From - Mon Feb 27 12:06:57 2023
X-Mozilla-Status: 0001
X-Mozilla-Status2: 00000000
Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de>
Date: Mon, 27 Feb 2023 12:06:56 +0100
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
Thunderbird/102.8.0
From: Arne Keller <2012gdwu@web.de>
Subject: From encoding test
To: arne.keller@posteo.de
Content-Language: de-DE
X-Enigmail-Draft-Status: N00200
X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0;
attachmentreminder=0; deliveryformat=1
X-Identity-Key: id2
Fcc: imap://2012gdwu@imap.web.de/Gesendet
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: 7bit
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
<p>&gt;From</p>
<p>Another word &gt;From<br>
</p>
</body>
</html>