Switch to a better dictionary source
This commit is contained in:
parent
f481edd84c
commit
0958719977
1
README.md
Normal file
1
README.md
Normal file
@ -0,0 +1 @@
|
||||
Source of word list: https://wortschatz.uni-leipzig.de/de/download/German
|
190
blacklist.txt
Normal file
190
blacklist.txt
Normal file
@ -0,0 +1,190 @@
|
||||
Alaba
|
||||
Alain
|
||||
Allen
|
||||
Alpen
|
||||
Anton
|
||||
Apple
|
||||
Armin
|
||||
Aston
|
||||
Athen
|
||||
Basel
|
||||
Bayer
|
||||
Bernd
|
||||
Biden
|
||||
Biker
|
||||
Björn
|
||||
Black
|
||||
Boris
|
||||
Brite
|
||||
Causa
|
||||
Chats
|
||||
Chefs
|
||||
Chile
|
||||
China
|
||||
Chips
|
||||
Chris
|
||||
Claus
|
||||
Cloud
|
||||
Coach
|
||||
Costa
|
||||
Covid
|
||||
Dänen
|
||||
David
|
||||
Deals
|
||||
Derby
|
||||
Dhabi
|
||||
Diana
|
||||
Diess
|
||||
Donau
|
||||
Dubai
|
||||
Eifel
|
||||
Esken
|
||||
Event
|
||||
facto
|
||||
Felix
|
||||
First
|
||||
FOCUS
|
||||
Fonds
|
||||
Frank
|
||||
Franz
|
||||
Fritz
|
||||
Front
|
||||
Fulda
|
||||
Fürth
|
||||
Games
|
||||
Gange
|
||||
Gates
|
||||
Georg
|
||||
Gotha
|
||||
Grand
|
||||
Green
|
||||
gross
|
||||
Group
|
||||
Guido
|
||||
Hagen
|
||||
Hamas
|
||||
Hanau
|
||||
Hansi
|
||||
Harry
|
||||
Hartz
|
||||
Heiko
|
||||
Heinz
|
||||
Helge
|
||||
heuer
|
||||
Heuer
|
||||
hiess
|
||||
Hofer
|
||||
Horst
|
||||
House
|
||||
Huber
|
||||
Image
|
||||
Intel
|
||||
Jakob
|
||||
James
|
||||
Japan
|
||||
Jason
|
||||
Jesus
|
||||
Jones
|
||||
Josef
|
||||
Juden
|
||||
Julia
|
||||
Kabul
|
||||
Karin
|
||||
Katar
|
||||
Katja
|
||||
Kevin
|
||||
Kickl
|
||||
Klaus
|
||||
Kleve
|
||||
Klopp
|
||||
Krems
|
||||
Kuntz
|
||||
Laura
|
||||
Lewis
|
||||
Leyen
|
||||
liess
|
||||
Linda
|
||||
Lions
|
||||
Louis
|
||||
Lucas
|
||||
Lukas
|
||||
Maier
|
||||
Mainz
|
||||
Marco
|
||||
Maria
|
||||
Marie
|
||||
Mario
|
||||
Marko
|
||||
Match
|
||||
Mayer
|
||||
Media
|
||||
Meyer
|
||||
Minsk
|
||||
Music
|
||||
Nadal
|
||||
Novak
|
||||
Obama
|
||||
Paris
|
||||
Party
|
||||
Patch
|
||||
Pauli
|
||||
Pence
|
||||
Peter
|
||||
Petra
|
||||
Pilot
|
||||
Point
|
||||
Polen
|
||||
Power
|
||||
Prime
|
||||
Putin
|
||||
Queen
|
||||
Ralph
|
||||
Rapid
|
||||
Remis
|
||||
Rhein
|
||||
Robin
|
||||
Roger
|
||||
Rossi
|
||||
Route
|
||||
Ryzen
|
||||
Sankt
|
||||
Sarah
|
||||
Silva
|
||||
Simon
|
||||
Smart
|
||||
Söder
|
||||
Songs
|
||||
Sound
|
||||
Space
|
||||
Spahn
|
||||
SpVgg
|
||||
Stars
|
||||
Steam
|
||||
Stiko
|
||||
Store
|
||||
Story
|
||||
Swiss
|
||||
Teams
|
||||
Texas
|
||||
Thiem
|
||||
Times
|
||||
Tirol
|
||||
Tokio
|
||||
Trend
|
||||
Trier
|
||||
Trump
|
||||
Tweet
|
||||
Union
|
||||
Urban
|
||||
Verdi
|
||||
Voice
|
||||
Weber
|
||||
weiss
|
||||
Weiss
|
||||
Wesel
|
||||
Wiens
|
||||
Willi
|
||||
Wings
|
||||
WKStA
|
||||
Wolff
|
||||
World
|
166730
deu_news_2021_100K-words.txt
Normal file
166730
deu_news_2021_100K-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
13
filter.sh
13
filter.sh
@ -1,4 +1,13 @@
|
||||
#!/bin/sh
# Builds two word lists from a tab-separated word file:
#   valid_words.txt   - 5-letter words taken from the first 10000 lines
#   valid_guesses.txt - 5-letter words taken from the whole file
# Both lists have every entry of blacklist.txt removed.
# Requires: awk, rg (ripgrep), sort, comm; the iconv line also needs iconv.
# NOTE(review): this text is a rendered diff without +/- markers. Going by the
# hunk header (-1,4 +1,13), the `iconv` line and the `rg --invert-match` line
# look like the pre-change versions that this commit removes - confirm before
# running this as a script.
|
||||
# Re-encodes top10000de.txt from ISO-8859-1 to UTF-8.
iconv -f ISO-8859-1 -t UTF-8 top10000de.txt > top10000de_utf8.txt
|
||||
# Abort as soon as a command fails. Without pipefail only the exit status of
# the last stage of each pipeline is checked.
set -e
|
||||
# Input word list; the pipelines below read its second tab-delimited field.
word_file=deu_news_2021_100K-words.txt
|
||||
# Take the first 10000 lines and keep only field 2 of each.
# NOTE(review): $word_file is unquoted; harmless for the name assigned above.
cat $word_file | head -n10000 | cut -f2 > top10000de_utf8.txt
|
||||
# Keep only lines whose length is exactly 5.
# NOTE(review): whether length() counts characters or bytes for umlauts
# depends on the awk implementation and locale - confirm on the target system.
awk '{ if (length($0) == 5) print }' top10000de_utf8.txt > top10000de_utf8_len5.txt
|
||||
# Drops every line containing an apostrophe or an eszett (ß).
cat top10000de_utf8_len5.txt | rg --invert-match "'|ß" > top10000de_utf8_len5_filtered.txt
|
||||
# Keeps only lines made up entirely of ASCII letters, ä ö ü Ä Ö Ü and ß
# (\$ inside double quotes reaches rg as the end-of-line anchor $), then sorts
# them. Writes the same output file as the `rg --invert-match` line above.
cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü|ß)+\$" | sort > top10000de_utf8_len5_filtered.txt
|
||||
# comm -23 prints the lines that occur only in the first file, i.e. the
# filtered words that are not listed in blacklist.txt.
# NOTE(review): comm expects both inputs in the same sort order; blacklist.txt
# is not sorted here, so it must already match what `sort` produces - confirm.
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt
|
||||
|
||||
# Same length and letter filter as above, applied to field 2 of every line of
# the word file rather than only the first 10000.
cat $word_file | cut -f2 | awk '{ if (length($0) == 5) print }' - | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü|ß)+\$" | sort > valid_guesses.txt
|
||||
# Remove blacklisted entries via a second file: redirecting straight into
# valid_guesses.txt would truncate it before comm could read it.
comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt
|
||||
# Replace the unfiltered list with the blacklist-filtered one.
mv valid_guesses2.txt valid_guesses.txt
|
||||
|
||||
# Delete the intermediate files produced for valid_words.txt.
rm top10000de_utf8.txt top10000de_utf8_len5.txt top10000de_utf8_len5_filtered.txt
|
||||
|
6714
valid_guesses.txt
Normal file
6714
valid_guesses.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user