Improve the regexp priority

This commit is contained in:
Ferran Basora 2019-02-21 12:48:16 +00:00
parent 8c738f16b3
commit a4d9ffab78
2 changed files with 47 additions and 12 deletions

View File

@ -32,24 +32,34 @@ impl<'a> State<'a> {
let mut matches = Vec::new(); let mut matches = Vec::new();
let mut patterns = Vec::new(); let mut patterns = Vec::new();
// TODO: Improve pattern preference patterns.push(Regex::new(r"((https?://|git@|git://|ssh://|ftp://|file:///)[\w?=%/_.:,;~@!#$&()*+-]*)").unwrap()); // Urls
patterns.push(Regex::new("((^|^\\.|[[:space:]]|[[:space:]]\\.|[[:space:]]\\.\\.|^\\.\\.)[[:alnum:]~_-]*/\\[\\][[:alnum:]_.#$%&+=/@-]+)").unwrap()); // Paths patterns.push(Regex::new(r"/[^ ]+").unwrap()); // Paths
patterns.push(Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap()); // Uid patterns.push(Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap()); // Uid
patterns.push(Regex::new(r"[0-9a-f]{7,40}").unwrap()); // Sha id patterns.push(Regex::new(r"[0-9a-f]{7,40}").unwrap()); // Sha id
// patterns.push(Regex::new(r"[0-9]{4,}").unwrap()); // Process or ports
patterns.push(Regex::new(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}").unwrap()); // Ip address patterns.push(Regex::new(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}").unwrap()); // Ip address
patterns.push(Regex::new(r"((https?://|git@|git://|ssh://|ftp://|file:///)[\w?=%/_.:,;~@!#$&()*+-]*)").unwrap()); // Urls patterns.push(Regex::new(r"0x[0-9a-fA-F]+").unwrap()); // Address
patterns.push(Regex::new(r"(0x[0-9a-fA-F]+)").unwrap()); // Address patterns.push(Regex::new(r"[0-9]{4,}").unwrap()); // Process or ports
for (index, line) in self.lines.iter().enumerate() { for (index, line) in self.lines.iter().enumerate() {
for pattern in patterns.iter() { let mut chunk: &str = line;
for mat in pattern.find_iter(line) { let mut offset: i32 = 0;
loop {
let submatches = patterns.iter().filter_map(|pattern| pattern.find_iter(chunk).nth(0)).collect::<Vec<_>>();
let first_match_option = submatches.iter().min_by(|x, y| x.start().cmp(&y.start()));
if let Some(first_match) = first_match_option {
matches.push(Match{ matches.push(Match{
x: mat.start() as i32, x: offset + first_match.start() as i32,
y: index as i32, y: index as i32,
text: &line[mat.start()..mat.end()], text: &chunk[first_match.start()..first_match.end()],
hint: None hint: None
}); });
chunk = chunk.get(first_match.end()..).unwrap();
offset = offset + first_match.end() as i32;
} else {
break;
} }
} }
} }
@ -58,7 +68,9 @@ impl<'a> State<'a> {
let mut hints = alphabet.hints(matches.len()); let mut hints = alphabet.hints(matches.len());
for mat in &mut matches { for mat in &mut matches {
mat.hint = Some(hints.pop().unwrap().to_string().clone()) if let Some(hint) = hints.pop() {
mat.hint = Some(hint.to_string().clone())
}
} }
return matches; return matches;
@ -97,7 +109,7 @@ mod tests {
fn match_uids () { fn match_uids () {
let output = "Lorem ipsum 123e4567-e89b-12d3-a456-426655440000 lorem\n Lorem lorem lorem"; let output = "Lorem ipsum 123e4567-e89b-12d3-a456-426655440000 lorem\n Lorem lorem lorem";
assert_ne!(match_lines(output).len(), 1); // FIXME regex priority assert_eq!(match_lines(output).len(), 1);
} }
#[test] #[test]
@ -125,6 +137,27 @@ mod tests {
fn match_addresses () { fn match_addresses () {
let output = "Lorem 0xfd70b5695 0x5246ddf lorem\n Lorem 0x973113 lorem"; let output = "Lorem 0xfd70b5695 0x5246ddf lorem\n Lorem 0x973113 lorem";
assert_ne!(match_lines(output).len(), 3); // FIXME regex priority assert_eq!(match_lines(output).len(), 3);
}
#[test]
fn match_process_port () {
let output = "Lorem 5695 52463 lorem\n Lorem 973113 lorem 99999 lorem 8888 lorem\n 23456 lorem 5432 lorem 23444";
assert_eq!(match_lines(output).len(), 8);
}
#[test]
fn priority () {
let output = "Lorem /var/fd70b569/9999.log 52463 lorem\n Lorem 973113 lorem 123e4567-e89b-12d3-a456-426655440000 lorem 8888 lorem\n https://crates.io/23456/fd70b569 lorem";
// Matches
// /var/fd70b569/9999.log
// 52463
// 973113
// 123e4567-e89b-12d3-a456-426655440000
// 8888
// https://crates.io/23456/fd70b569
assert_eq!(match_lines(output).len(), 6);
} }
} }

2
test
View File

@ -10,6 +10,8 @@ dolor mi, ac faucibus justo maximus eu. Donec vestibulum lorem semper,
ultricies libero 5734d923afb8 efc9061663 ac, imperdiet ex. Morbi ultricies libero 5734d923afb8 efc9061663 ac, imperdiet ex. Morbi
123e4567-e89b-12d3-a456-426655440000 tempor mollis condimentum. 123e4567-e89b-12d3-a456-426655440000 tempor mollis condimentum.
lorem /var/log/123e4567/999.log lorem /etc/system.conf lorem
10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34 10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34
10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34 10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34
10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34 10.3.23.42 lorem 123.2.3.4 lorem 230.23.33.34