Normalize word case to avoid duplicates

This commit is contained in:
Arne Keller 2022-02-23 11:52:11 +01:00
parent ecc598d897
commit 0fd07d7d30
5 changed files with 6110 additions and 6108 deletions

View File

@@ -1,190 +1,190 @@
Alaba alaba
Alain alain
Allen allen
Alpen alpen
Anton anton
Apple apple
Armin armin
Aston aston
Athen athen
Basel basel
Bayer bayer
Bernd bernd
Biden biden
Biker biker
Björn björn
Black black
Boris boris
Brite brite
Causa causa
Chats chats
Chefs chefs
Chile chile
China china
Chips chips
Chris chris
Claus claus
Cloud cloud
Coach coach
Costa costa
Covid covid
Dänen dänen
David david
Deals deals
Derby derby
Dhabi dhabi
Diana diana
Diess diess
Donau donau
Dubai dubai
Eifel eifel
Esken esken
Event event
facto facto
Felix felix
First first
FOCUS focus
Fonds fonds
Frank frank
Franz franz
Fritz fritz
Front front
Fulda fulda
Fürth fürth
Games games
Gange gange
Gates gates
Georg georg
Gotha gotha
Grand grand
Green green
gross gross
Group group
Guido guido
Hagen hagen
Hamas hamas
Hanau hanau
Hansi hansi
Harry harry
Hartz hartz
Heiko heiko
Heinz heinz
Helge helge
heuer
heuer heuer
Heuer
hiess hiess
Hofer hofer
Horst horst
House house
Huber huber
Image image
Intel intel
Jakob jakob
James james
Japan japan
Jason jason
Jesus jesus
Jones jones
Josef josef
Juden juden
Julia julia
Kabul kabul
Karin karin
Katar katar
Katja katja
Kevin kevin
Kickl kickl
Klaus klaus
Kleve kleve
Klopp klopp
Krems krems
Kuntz kuntz
Laura laura
Lewis lewis
Leyen leyen
liess liess
Linda linda
Lions lions
Louis louis
Lucas lucas
Lukas lukas
Maier maier
Mainz mainz
Marco marco
Maria maria
Marie marie
Mario mario
Marko marko
Match match
Mayer mayer
Media media
Meyer meyer
Minsk minsk
Music music
Nadal nadal
Novak novak
Obama obama
Paris paris
Party party
Patch patch
Pauli pauli
Pence pence
Peter peter
Petra petra
Pilot pilot
Point point
Polen polen
Power power
Prime prime
Putin putin
Queen queen
Ralph ralph
Rapid rapid
Remis remis
Rhein rhein
Robin robin
Roger roger
Rossi rossi
Route route
Ryzen ryzen
Sankt sankt
Sarah sarah
Silva silva
Simon simon
Smart smart
Söder söder
Songs songs
Sound sound
Space space
Spahn spahn
SpVgg spvgg
Stars stars
Steam steam
Stiko stiko
Store store
Story story
Swiss swiss
Teams teams
Texas texas
Thiem thiem
Times times
Tirol tirol
Tokio tokio
Trend trend
Trier trier
Trump trump
Tweet tweet
Union union
Urban urban
Verdi verdi
Voice voice
Weber weber
weiss weiss
Weiss weiss
Wesel wesel
Wiens wiens
Willi willi
Wings wings
WKStA wksta
Wolff wolff
World world

View File

@@ -2,11 +2,11 @@
set -e set -e
word_file=deu_news_2021_100K-words.txt word_file=deu_news_2021_100K-words.txt
cat $word_file | head -n10000 | cut -f2 > top10000de_utf8.txt cat $word_file | head -n10000 | cut -f2 > top10000de_utf8.txt
awk '{ if (length($0) == 5) print }' top10000de_utf8.txt > top10000de_utf8_len5.txt awk '{ if (length($0) == 5) print tolower($0) }' top10000de_utf8.txt > top10000de_utf8_len5.txt
cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > top10000de_utf8_len5_filtered.txt cat top10000de_utf8_len5.txt | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > top10000de_utf8_len5_filtered.txt
comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt comm -23 top10000de_utf8_len5_filtered.txt blacklist.txt > valid_words.txt
cat $word_file | cut -f2 | awk '{ if (length($0) == 5) print }' - | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > valid_guesses.txt cat $word_file | cut -f2 | awk '{ if (length($0) == 5) print tolower($0) }' - | rg "^([A-Za-z]|ä|ö|ü|Ä|Ö|Ü)+\$" | sort > valid_guesses.txt
comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt comm -23 valid_guesses.txt blacklist.txt > valid_guesses2.txt
mv valid_guesses2.txt valid_guesses.txt mv valid_guesses2.txt valid_guesses.txt

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff