Switch to a better dictionary source

This commit is contained in:
Arne Keller 2022-02-23 10:38:57 +01:00
parent f481edd84c
commit 0958719977
7 changed files with 174472 additions and 1004 deletions

1
README.md Normal file
View File

@ -0,0 +1 @@
Source of word list: https://wortschatz.uni-leipzig.de/de/download/German

190
blacklist.txt Normal file
View File

@ -0,0 +1,190 @@
Alaba
Alain
Allen
Alpen
Anton
Apple
Armin
Aston
Athen
Basel
Bayer
Bernd
Biden
Biker
Björn
Black
Boris
Brite
Causa
Chats
Chefs
Chile
China
Chips
Chris
Claus
Cloud
Coach
Costa
Covid
Dänen
David
Deals
Derby
Dhabi
Diana
Diess
Donau
Dubai
Eifel
Esken
Event
facto
Felix
First
FOCUS
Fonds
Frank
Franz
Fritz
Front
Fulda
Fürth
Games
Gange
Gates
Georg
Gotha
Grand
Green
gross
Group
Guido
Hagen
Hamas
Hanau
Hansi
Harry
Hartz
Heiko
Heinz
Helge
heuer
Heuer
hiess
Hofer
Horst
House
Huber
Image
Intel
Jakob
James
Japan
Jason
Jesus
Jones
Josef
Juden
Julia
Kabul
Karin
Katar
Katja
Kevin
Kickl
Klaus
Kleve
Klopp
Krems
Kuntz
Laura
Lewis
Leyen
liess
Linda
Lions
Louis
Lucas
Lukas
Maier
Mainz
Marco
Maria
Marie
Mario
Marko
Match
Mayer
Media
Meyer
Minsk
Music
Nadal
Novak
Obama
Paris
Party
Patch
Pauli
Pence
Peter
Petra
Pilot
Point
Polen
Power
Prime
Putin
Queen
Ralph
Rapid
Remis
Rhein
Robin
Roger
Rossi
Route
Ryzen
Sankt
Sarah
Silva
Simon
Smart
Söder
Songs
Sound
Space
Spahn
SpVgg
Stars
Steam
Stiko
Store
Story
Swiss
Teams
Texas
Thiem
Times
Tirol
Tokio
Trend
Trier
Trump
Tweet
Union
Urban
Verdi
Voice
Weber
weiss
Weiss
Wesel
Wiens
Willi
Wings
WKStA
Wolff
World

166730
deu_news_2021_100K-words.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,13 @@
#!/bin/sh
# Build valid_words.txt and valid_guesses.txt from the downloaded word list.
#
# Inputs (read from the current directory):
#   deu_news_2021_100K-words.txt  tab-separated; the word is taken from column 2
#   blacklist.txt                 one word per line; removed from both outputs
# Outputs (written to the current directory):
#   valid_words.txt    accepted words among the first 10000 lines of the list
#   valid_guesses.txt  accepted words from the whole list
#
# A word is accepted when awk reports its length as 5, it consists only of
# ASCII letters, umlauts and ß, and it is not listed in blacklist.txt.
# Requires ripgrep (rg) in addition to the standard utilities.
set -eu

word_file=deu_news_2021_100K-words.txt
blacklist_file=blacklist.txt
letters_only='^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü|ß)+$'

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Fail early: without pipefail a missing input would otherwise just produce
# empty output files without any error.
command -v rg >/dev/null 2>&1 || die "ripgrep (rg) is required"
[ -r "$word_file" ] || die "cannot read $word_file"
[ -r "$blacklist_file" ] || die "cannot read $blacklist_file"

# comm needs both inputs sorted with the same collation, so sort the blacklist
# here with the same sort/locale instead of trusting its on-disk order.
sorted_blacklist=$(mktemp)
trap 'rm -f -- "$sorted_blacklist"' EXIT
sort "$blacklist_file" > "$sorted_blacklist"

# Read candidate words on stdin (one per line) and print the accepted ones,
# sorted, with blacklisted entries removed.
select_words() {
  awk 'length($0) == 5' \
    | rg "$letters_only" \
    | sort \
    | comm -23 - "$sorted_blacklist"
}

head -n 10000 "$word_file" | cut -f2 | select_words > valid_words.txt
cut -f2 "$word_file" | select_words > valid_guesses.txt

File diff suppressed because one or more lines are too long

6714
valid_guesses.txt Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff