Normalize word case to avoid duplicates
This commit is contained in:
parent
ecc598d897
commit
0fd07d7d30
368
blacklist.txt
368
blacklist.txt
@ -1,190 +1,190 @@
|
|||||||
Alaba
|
alaba
|
||||||
Alain
|
alain
|
||||||
Allen
|
allen
|
||||||
Alpen
|
alpen
|
||||||
Anton
|
anton
|
||||||
Apple
|
apple
|
||||||
Armin
|
armin
|
||||||
Aston
|
aston
|
||||||
Athen
|
athen
|
||||||
Basel
|
basel
|
||||||
Bayer
|
bayer
|
||||||
Bernd
|
bernd
|
||||||
Biden
|
biden
|
||||||
Biker
|
biker
|
||||||
Björn
|
björn
|
||||||
Black
|
black
|
||||||
Boris
|
boris
|
||||||
Brite
|
brite
|
||||||
Causa
|
causa
|
||||||
Chats
|
chats
|
||||||
Chefs
|
chefs
|
||||||
Chile
|
chile
|
||||||
China
|
china
|
||||||
Chips
|
chips
|
||||||
Chris
|
chris
|
||||||
Claus
|
claus
|
||||||
Cloud
|
cloud
|
||||||
Coach
|
coach
|
||||||
Costa
|
costa
|
||||||
Covid
|
covid
|
||||||
Dänen
|
dänen
|
||||||
David
|
david
|
||||||
Deals
|
deals
|
||||||
Derby
|
derby
|
||||||
Dhabi
|
dhabi
|
||||||
Diana
|
diana
|
||||||
Diess
|
diess
|
||||||
Donau
|
donau
|
||||||
Dubai
|
dubai
|
||||||
Eifel
|
eifel
|
||||||
Esken
|
esken
|
||||||
Event
|
event
|
||||||
facto
|
facto
|
||||||
Felix
|
felix
|
||||||
First
|
first
|
||||||
FOCUS
|
focus
|
||||||
Fonds
|
fonds
|
||||||
Frank
|
frank
|
||||||
Franz
|
franz
|
||||||
Fritz
|
fritz
|
||||||
Front
|
front
|
||||||
Fulda
|
fulda
|
||||||
Fürth
|
fürth
|
||||||
Games
|
games
|
||||||
Gange
|
gange
|
||||||
Gates
|
gates
|
||||||
Georg
|
georg
|
||||||
Gotha
|
gotha
|
||||||
Grand
|
grand
|
||||||
Green
|
green
|
||||||
gross
|
gross
|
||||||
Group
|
group
|
||||||
Guido
|
guido
|
||||||
Hagen
|
hagen
|
||||||
Hamas
|
hamas
|
||||||
Hanau
|
hanau
|
||||||
Hansi
|
hansi
|
||||||
Harry
|
harry
|
||||||
Hartz
|
hartz
|
||||||
Heiko
|
heiko
|
||||||
Heinz
|
heinz
|
||||||
Helge
|
helge
|
||||||
|
heuer
|
||||||
heuer
|
heuer
|
||||||
Heuer
|
|
||||||
hiess
|
hiess
|
||||||
Hofer
|
hofer
|
||||||
Horst
|
horst
|
||||||
House
|
house
|
||||||
Huber
|
huber
|
||||||
Image
|
image
|
||||||
Intel
|
intel
|
||||||
Jakob
|
jakob
|
||||||
James
|
james
|
||||||
Japan
|
japan
|
||||||
Jason
|
jason
|
||||||
Jesus
|
jesus
|
||||||
Jones
|
jones
|
||||||
Josef
|
josef
|
||||||
Juden
|
juden
|
||||||
Julia
|
julia
|
||||||
Kabul
|
kabul
|
||||||
Karin
|
karin
|
||||||
Katar
|
katar
|
||||||
Katja
|
katja
|
||||||
Kevin
|
kevin
|
||||||
Kickl
|
kickl
|
||||||
Klaus
|
klaus
|
||||||
Kleve
|
kleve
|
||||||
Klopp
|
klopp
|
||||||
Krems
|
krems
|
||||||
Kuntz
|
kuntz
|
||||||
Laura
|
laura
|
||||||
Lewis
|
lewis
|
||||||
Leyen
|
leyen
|
||||||
liess
|
liess
|
||||||
Linda
|
linda
|
||||||
Lions
|
lions
|
||||||
Louis
|
louis
|
||||||
Lucas
|
lucas
|
||||||
Lukas
|
lukas
|
||||||
Maier
|
maier
|
||||||
Mainz
|
mainz
|
||||||
Marco
|
marco
|
||||||
Maria
|
maria
|
||||||
Marie
|
marie
|
||||||
Mario
|
mario
|
||||||
Marko
|
marko
|
||||||
Match
|
match
|
||||||
Mayer
|
mayer
|
||||||
Media
|
media
|
||||||
Meyer
|
meyer
|
||||||
Minsk
|
minsk
|
||||||
Music
|
music
|
||||||
Nadal
|
nadal
|
||||||
Novak
|
novak
|
||||||
Obama
|
obama
|
||||||
Paris
|
paris
|
||||||
Party
|
party
|
||||||
Patch
|
patch
|
||||||
Pauli
|
pauli
|
||||||
Pence
|
pence
|
||||||
Peter
|
peter
|
||||||
Petra
|
petra
|
||||||
Pilot
|
pilot
|
||||||
Point
|
point
|
||||||
Polen
|
polen
|
||||||
Power
|
power
|
||||||
Prime
|
prime
|
||||||
Putin
|
putin
|
||||||
Queen
|
queen
|
||||||
Ralph
|
ralph
|
||||||
Rapid
|
rapid
|
||||||
Remis
|
remis
|
||||||
Rhein
|
rhein
|
||||||
Robin
|
robin
|
||||||
Roger
|
roger
|
||||||
Rossi
|
rossi
|
||||||
Route
|
route
|
||||||
Ryzen
|
ryzen
|
||||||
Sankt
|
sankt
|
||||||
Sarah
|
sarah
|
||||||
Silva
|
silva
|
||||||
Simon
|
simon
|
||||||
Smart
|
smart
|
||||||
Söder
|
söder
|
||||||
Songs
|
songs
|
||||||
Sound
|
sound
|
||||||
Space
|
space
|
||||||
Spahn
|
spahn
|
||||||
SpVgg
|
spvgg
|
||||||
Stars
|
stars
|
||||||
Steam
|
steam
|
||||||
Stiko
|
stiko
|
||||||
Store
|
store
|
||||||
Story
|
story
|
||||||
Swiss
|
swiss
|
||||||
Teams
|
teams
|
||||||
Texas
|
texas
|
||||||
Thiem
|
thiem
|
||||||
Times
|
times
|
||||||
Tirol
|
tirol
|
||||||
Tokio
|
tokio
|
||||||
Trend
|
trend
|
||||||
Trier
|
trier
|
||||||
Trump
|
trump
|
||||||
Tweet
|
tweet
|
||||||
Union
|
union
|
||||||
Urban
|
urban
|
||||||
Verdi
|
verdi
|
||||||
Voice
|
voice
|
||||||
Weber
|
weber
|
||||||
weiss
|
weiss
|
||||||
Weiss
|
weiss
|
||||||
Wesel
|
wesel
|
||||||
Wiens
|
wiens
|
||||||
Willi
|
willi
|
||||||
Wings
|
wings
|
||||||
WKStA
|
wksta
|
||||||
Wolff
|
wolff
|
||||||
World
|
world
|
||||||
|
@ -2,11 +2,11 @@
|
|||||||
set -e
|
set -e
|
||||||
word_file=deu_news_2021_100K-words.txt
|
word_file=deu_news_2021_100K-words.txt
|
||||||
cat $word_file | head -n10000 | cut -f2 > top10000de_utf8.txt
|
cat $word_file | head -n10000 | cut -f2 > top10000de_utf8.txt
|
||||||
awk '{ if (length($0) == 5) print }' top10000de_utf8.txt > top10000de_utf8_len5.txt
|
awk '{ if (length($0) == 5) print tolower($0) }' top10000de_utf8.txt > top10000de_utf8_len5.txt
|
||||||
cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > top10000de_utf8_len5_filtered.txt
|
cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > top10000de_utf8_len5_filtered.txt
|
||||||
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt
|
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt
|
||||||
|
|
||||||
cat $word_file | cut -f2 | awk '{ if (length($0) == 5) print }' - | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > valid_guesses.txt
|
cat $word_file | cut -f2 | awk '{ if (length($0) == 5) print tolower($0) }' - | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > valid_guesses.txt
|
||||||
comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt
|
comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt
|
||||||
mv valid_guesses2.txt valid_guesses.txt
|
mv valid_guesses2.txt valid_guesses.txt
|
||||||
|
|
||||||
|
10931
valid_guesses.txt
10931
valid_guesses.txt
File diff suppressed because it is too large
Load Diff
910
valid_words.txt
910
valid_words.txt
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user