Switch to wikipedia words

This commit is contained in:
Arne Keller 2022-02-28 19:30:49 +01:00
parent 85180cd926
commit 55b61ad858
5 changed files with 5000 additions and 2692 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
 #!/bin/sh
 set -e
-word_file=deu_news_2021_100K-words.txt
-cat $word_file | head -n15000 | cut -f2 > top10000de_utf8.txt
+word_file=deu_wikipedia_2021_1M-words.txt
+cat $word_file | head -n35000 | cut -f2 > top10000de_utf8.txt
 awk '{ if (length($0) == 5) print tolower($0) }' top10000de_utf8.txt > top10000de_utf8_len5.txt
 cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort | uniq > top10000de_utf8_len5_filtered.txt
 comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff