Switch to Wikipedia words

This commit is contained in:
Arne Keller 2022-02-28 19:30:49 +01:00
parent 85180cd926
commit 55b61ad858
5 changed files with 5000 additions and 2692 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
#!/bin/sh
# Build valid_words.txt: the five-character, letters-only words taken from the
# first 35000 lines of a word list, lowercased, de-duplicated, and with every
# entry of blacklist.txt removed.
#
# Inputs (read from the current directory):
#   deu_wikipedia_2021_1M-words.txt  word list; the word is tab-separated field 2
#   blacklist.txt                    words to exclude, one per line
# Outputs:
#   top10000de_utf8.txt, top10000de_utf8_len5.txt,
#   top10000de_utf8_len5_filtered.txt (intermediate files) and valid_words.txt.
#   The "top10000" in the names is historical; 35000 lines are read.
# Requires: rg (ripgrep) on PATH.
set -e

word_file=deu_wikipedia_2021_1M-words.txt

# Without pipefail, a pipeline's status is that of its last command, so a
# missing input would silently produce an empty word list. Check up front.
if [ ! -r "$word_file" ]; then
  printf 'error: cannot read %s\n' "$word_file" >&2
  exit 1
fi

# Keep the first 35000 lines and extract the word column.
head -n35000 "$word_file" | cut -f2 > top10000de_utf8.txt

# Keep only lines of length 5, lowercased.
# NOTE(review): how length() and tolower() treat umlauts depends on the awk
# implementation and locale - confirm a UTF-8 aware awk is used.
awk '{ if (length($0) == 5) print tolower($0) }' top10000de_utf8.txt > top10000de_utf8_len5.txt

# Keep only words made up entirely of ASCII letters and German umlauts, then
# sort and de-duplicate so the result can be fed to comm.
rg '^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+$' < top10000de_utf8_len5.txt | sort | uniq > top10000de_utf8_len5_filtered.txt

# Emit the lines that appear only in the filtered list (i.e. not blacklisted).
# comm expects both inputs sorted in the same collation order, so blacklist.txt
# must be sorted the same way sort orders the filtered list.
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff