#!/bin/sh
#
# Builds two five-letter word lists from a tab-separated word list whose
# second column holds the word:
#
#   valid_words.txt    words taken from the first 35000 lines of the list
#   valid_guesses.txt  words taken from the whole list
#
# Both outputs contain only lowercase words of exactly five letters drawn
# from a-z, ä, ö, ü; they are sorted, de-duplicated, and stripped of every
# line that also appears in blacklist.txt.
#
# Requirements:
#   - ripgrep (rg) on PATH
#   - blacklist.txt sorted in the current locale's collation order, because
#     comm(1) expects both of its inputs to be sorted

set -eu

word_file=deu_wikipedia_2021_1M-words.txt
blacklist=blacklist.txt
top_lines=35000

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Reads candidate words (one per line) on stdin and writes the final list
# to stdout. The exit status is that of comm, the last pipeline stage.
filter_words() {
  # ripgrep counts Unicode characters regardless of locale, so "{5}" means
  # five letters even when some of them are multi-byte umlauts. awk's
  # length() counts bytes in several implementations and in the C locale.
  rg --no-config '^[A-Za-zÄÖÜäöü]{5}$' |
    # LC_ALL=C keeps tolower() ASCII-only in every awk; the umlauts are
    # lowercased explicitly so the result does not depend on the awk in use.
    LC_ALL=C awk '{
      word = tolower($0)
      gsub(/Ä/, "ä", word)
      gsub(/Ö/, "ö", word)
      gsub(/Ü/, "ü", word)
      print word
    }' |
    sort |
    uniq |
    comm -23 - "$blacklist"
}

# Without pipefail a failing early pipeline stage would go unnoticed and
# produce empty lists, so the preconditions are checked up front.
[ -r "$word_file" ] || die "cannot read $word_file"
[ -r "$blacklist" ] || die "cannot read $blacklist"
command -v rg >/dev/null 2>&1 || die "ripgrep (rg) is required"

# Results are built in a private directory and moved into place only after
# both lists succeeded, so a failed run never leaves partial outputs behind.
tmp_dir=$(mktemp -d) || die "mktemp failed"
trap 'rm -rf -- "$tmp_dir"' EXIT
trap 'exit 1' HUP INT TERM

head -n "$top_lines" "$word_file" | cut -f2 | filter_words > "$tmp_dir/valid_words.txt"
cut -f2 "$word_file" | filter_words > "$tmp_dir/valid_guesses.txt"

mv "$tmp_dir/valid_words.txt" valid_words.txt
mv "$tmp_dir/valid_guesses.txt" valid_guesses.txt