qwörtle/filter.sh
2022-02-23 10:39:21 +01:00

14 lines
714 B
Bash
Executable File

#!/bin/sh
# Build the two word lists from a tab-separated word list.
#
# Inputs (read from the current directory):
#   deu_news_2021_100K-words.txt  word list; the word is taken from column 2
#   blacklist.txt                 words to exclude, one per line
#
# Outputs (written to the current directory):
#   valid_words.txt    five-letter words found in the first 10000 lines of
#                      the word list, minus the blacklist
#   valid_guesses.txt  five-letter words found anywhere in the word list,
#                      minus the blacklist
#
# Requires ripgrep (rg).
set -eu

word_file=deu_news_2021_100K-words.txt
blacklist_file=blacklist.txt

# Fail loudly on missing inputs: without pipefail a missing word list would
# otherwise just produce empty output files.
for required in "$word_file" "$blacklist_file"; do
  if [ ! -r "$required" ]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$required" >&2
    exit 1
  fi
done

if ! command -v rg >/dev/null 2>&1; then
  printf '%s: ripgrep (rg) is required\n' "${0##*/}" >&2
  exit 1
fi

# comm needs both inputs sorted in the same collation order, so sort the
# blacklist with the same sort (and locale) that is used for the candidates.
sorted_blacklist=$(mktemp)
trap 'rm -f -- "$sorted_blacklist"' EXIT
trap 'exit 1' HUP INT TERM
sort -- "$blacklist_file" > "$sorted_blacklist"

# Reads word-list lines on stdin and prints, sorted, the column-2 words that
# consist of exactly five letters (ASCII letters, ä ö ü Ä Ö Ü, ß) and are not
# blacklisted.
# The length is enforced by the regex ({5}) rather than by awk's length():
# ripgrep counts Unicode code points, whereas awk's length() counts bytes in
# non-multibyte-aware implementations or under the C locale, which would
# mis-measure words containing umlauts or ß.
five_letter_words() {
  cut -f2 \
    | rg '^[A-Za-zäöüÄÖÜß]{5}$' \
    | sort \
    | comm -23 - "$sorted_blacklist"
}

# Solutions: restricted to the first 10000 lines of the word list.
head -n 10000 "$word_file" | five_letter_words > valid_words.txt

# Accepted guesses: the whole word list.
five_letter_words < "$word_file" > valid_guesses.txt