Normalize word case to avoid duplicates

This commit is contained in:
Arne Keller 2022-02-23 11:52:11 +01:00
parent ecc598d897
commit 0fd07d7d30
5 changed files with 6110 additions and 6108 deletions

View File

@ -1,190 +1,190 @@
Alaba
Alain
Allen
Alpen
Anton
Apple
Armin
Aston
Athen
Basel
Bayer
Bernd
Biden
Biker
Björn
Black
Boris
Brite
Causa
Chats
Chefs
Chile
China
Chips
Chris
Claus
Cloud
Coach
Costa
Covid
Dänen
David
Deals
Derby
Dhabi
Diana
Diess
Donau
Dubai
Eifel
Esken
Event
alaba
alain
allen
alpen
anton
apple
armin
aston
athen
basel
bayer
bernd
biden
biker
björn
black
boris
brite
causa
chats
chefs
chile
china
chips
chris
claus
cloud
coach
costa
covid
dänen
david
deals
derby
dhabi
diana
diess
donau
dubai
eifel
esken
event
facto
Felix
First
FOCUS
Fonds
Frank
Franz
Fritz
Front
Fulda
Fürth
Games
Gange
Gates
Georg
Gotha
Grand
Green
felix
first
focus
fonds
frank
franz
fritz
front
fulda
fürth
games
gange
gates
georg
gotha
grand
green
gross
Group
Guido
Hagen
Hamas
Hanau
Hansi
Harry
Hartz
Heiko
Heinz
Helge
group
guido
hagen
hamas
hanau
hansi
harry
hartz
heiko
heinz
helge
heuer
heuer
Heuer
hiess
Hofer
Horst
House
Huber
Image
Intel
Jakob
James
Japan
Jason
Jesus
Jones
Josef
Juden
Julia
Kabul
Karin
Katar
Katja
Kevin
Kickl
Klaus
Kleve
Klopp
Krems
Kuntz
Laura
Lewis
Leyen
hofer
horst
house
huber
image
intel
jakob
james
japan
jason
jesus
jones
josef
juden
julia
kabul
karin
katar
katja
kevin
kickl
klaus
kleve
klopp
krems
kuntz
laura
lewis
leyen
liess
Linda
Lions
Louis
Lucas
Lukas
Maier
Mainz
Marco
Maria
Marie
Mario
Marko
Match
Mayer
Media
Meyer
Minsk
Music
Nadal
Novak
Obama
Paris
Party
Patch
Pauli
Pence
Peter
Petra
Pilot
Point
Polen
Power
Prime
Putin
Queen
Ralph
Rapid
Remis
Rhein
Robin
Roger
Rossi
Route
Ryzen
Sankt
Sarah
Silva
Simon
Smart
Söder
Songs
Sound
Space
Spahn
SpVgg
Stars
Steam
Stiko
Store
Story
Swiss
Teams
Texas
Thiem
Times
Tirol
Tokio
Trend
Trier
Trump
Tweet
Union
Urban
Verdi
Voice
Weber
linda
lions
louis
lucas
lukas
maier
mainz
marco
maria
marie
mario
marko
match
mayer
media
meyer
minsk
music
nadal
novak
obama
paris
party
patch
pauli
pence
peter
petra
pilot
point
polen
power
prime
putin
queen
ralph
rapid
remis
rhein
robin
roger
rossi
route
ryzen
sankt
sarah
silva
simon
smart
söder
songs
sound
space
spahn
spvgg
stars
steam
stiko
store
story
swiss
teams
texas
thiem
times
tirol
tokio
trend
trier
trump
tweet
union
urban
verdi
voice
weber
weiss
Weiss
Wesel
Wiens
Willi
Wings
WKStA
Wolff
World
weiss
wesel
wiens
willi
wings
wksta
wolff
world

View File

@ -2,11 +2,11 @@
set -e
# Builds the word lists from a tab-separated corpus file whose second
# column holds the word:
#   valid_words.txt   - five-character words taken from the first 10000
#                       corpus lines
#   valid_guesses.txt - five-character words taken from the whole corpus
# Words are lowercased, de-duplicated and stripped of blacklist entries.
word_file=deu_news_2021_100K-words.txt
# Only words consisting of ASCII letters and German umlauts are kept.
word_pattern='^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+$'

# Without this check a missing corpus would go unnoticed: the exit status
# of a pipeline is that of its last stage, so `set -e` would not fire and
# empty word lists would be written.
if [ ! -r "$word_file" ]; then
  printf 'error: cannot read %s\n' "$word_file" >&2
  exit 1
fi

head -n 10000 -- "$word_file" | cut -f2 > top10000de_utf8.txt
# NOTE(review): length() and tolower() only handle umlauts correctly when
# awk is multibyte-aware and runs in a UTF-8 locale - confirm for the
# awk in use.
awk 'length($0) == 5 { print tolower($0) }' top10000de_utf8.txt > top10000de_utf8_len5.txt
# sort -u: lowercasing turns e.g. "Heuer" and "heuer" into the same word;
# a plain sort would keep both copies in the output.
rg "$word_pattern" < top10000de_utf8_len5.txt | sort -u > top10000de_utf8_len5_filtered.txt
# comm -23 keeps the lines that occur only in the first file.
# NOTE(review): comm expects blacklist.txt to be sorted with the same
# collation as the sort above, and its entries presumably need to be
# lowercase to match - verify.
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt

cut -f2 -- "$word_file" \
  | awk 'length($0) == 5 { print tolower($0) }' \
  | rg "$word_pattern" \
  | sort -u > valid_guesses.txt
# comm cannot write to the file it is reading, hence the temporary file.
comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt
mv valid_guesses2.txt valid_guesses.txt

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff