import json
import numpy as np
import pandas as pd
import hvplot.pandas # noqa
from tqdm.notebook import tqdm
import multiprocessing as mp
from functools import reduce
from pathlib import Path
pd.options.plotting.backend = 'holoviews'
Due to some oddities in the way that notebooks handle multiprocessing, we will stash some of our functions in a separate helper file.
import wordle_helpers
from wordle_helpers import make_charray, guess, possible_words, entropy_reduction
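Why a separate file? A multiprocessing pool has to pickle and re-import the function it runs in each worker, and functions defined interactively inside a notebook can't always be handled that way. As a rough sketch (not the verbatim file), wordle_helpers.py just holds plain, importable copies of the same functions defined later in this post, e.g.:

# wordle_helpers.py -- an importable module, so mp.Pool can find these functions by name
import numpy as np

def make_charray(words: list[str]) -> np.ndarray:
    """split words into an n x 5 character array"""
    return np.array([[*word] for word in words])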
The list of words is hardcoded in the wordle source, and I've saved them as a .json object where game is a list of the words actually used as answers in the game and legal is a list of the words that are legal to play. For all of the analyses that follow we'll only consider the game words; even though the player doesn't know a priori which words can be correct answers, this keeps the number of computations manageable for our janky blog-level code.
with open('wordle_words.json', 'r') as wfile:
words = json.load(wfile)
game, legal = words['game'], words['legal']
print(game[0:5])
print(f'Game words: {len(game)}, guessable words: {len(legal)}')
['cigar', 'rebut', 'sissy', 'humph', 'awake']
Game words: 2315, guessable words: 10657
How many times does each letter occur in each position? One way to think about the best first word is "letterwise": given how frequently each letter appears in each position, which word maximizes the chance that each of its letters (treated independently) is correct?
First, make a function to split words into a 2-dimensional array, word x letter position
def make_charray(words:list[str]) -> np.ndarray:
    """split words into an n x 5 character array"""
    chars = [[*word] for word in words]
    charray = np.array(chars)
    return charray
charray = make_charray(game)
charray[0:5,:]
array([['c', 'i', 'g', 'a', 'r'],
       ['r', 'e', 'b', 'u', 't'],
       ['s', 'i', 's', 's', 'y'],
       ['h', 'u', 'm', 'p', 'h'],
       ['a', 'w', 'a', 'k', 'e']], dtype='<U1')
Then, we iterate through the 5 letter positions, using the numpy.unique function to count the number of times each unique letter appears. We make a tuple of (position, letter, count) for each character in each position, and combine that into a pandas DataFrame.
tups = []
for i, col in enumerate(charray.T):
    letters, counts = np.unique(col, return_counts=True)
    # make tuples of (position, letter, count)
    tups.extend([(i, letter, count) for letter, count in zip(letters, counts)])
print(tups[0:5])
[(0, 'a', 141), (0, 'b', 173), (0, 'c', 198), (0, 'd', 111), (0, 'e', 72)]
# recombine in a pandas dataframe
position_freqs = pd.DataFrame(tups, columns=["position", "letter", "count"])
position_freqs.head()
| | position | letter | count |
|---|---|---|---|
0 | 0 | a | 141 |
1 | 0 | b | 173 |
2 | 0 | c | 198 |
3 | 0 | d | 111 |
4 | 0 | e | 72 |
Now we can visualize the frequency of each letter in each position as a heatmap:
position_freqs.hvplot.heatmap(x='position', y='letter', C='count',
                              height=500, width=300, flip_yaxis=True)
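The same counts can also be read off directly as a table by pivoting the long-form frame into a letter x position matrix (a quick sketch using the position_freqs frame from above):

# wide letter x position count matrix; letter/position pairs that never occur become 0
freq_matrix = position_freqs.pivot(index='letter', columns='position', values='count').fillna(0).astype(int)
freq_matrix.head()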
Finally, for each word, we sum the counts of each of its letters in its position to get a total score, then look at the lowest- and highest-scoring words.
word_scores = []
for word in game:
    score = 0
    for i, letter in enumerate(word):
        score += position_freqs[
            (position_freqs['position'] == i) & (position_freqs['letter'] == letter)
        ]['count'].values[0]
    word_scores.append((word, score))
word_scores = pd.DataFrame(word_scores, columns=["word", "score"])
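The nested loop above runs a DataFrame filter for every letter of every word, which is slow. As a sketch of an equivalent, vectorized computation (assuming the game, charray, and position_freqs objects already defined above), the same scores can be computed with a single merge:

# one row per (word, position, letter)
long = pd.DataFrame({
    'word': np.repeat(game, 5),
    'position': np.tile(np.arange(5), len(game)),
    'letter': charray.ravel(),
})
# look up each letter's positional count, then sum per word
merged = long.merge(position_freqs, on=['position', 'letter'], how='left')
vector_scores = merged.groupby('word', sort=False)['count'].sum().reset_index(name='score')
# vector_scores holds the same scores as word_scores, minus the Python loop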
The worst words, predictably, have a bunch of rare letters in rare positions -- e.g. "nymph" (but remember that this analysis doesn't consider letter combinations specifically).
word_scores = word_scores.sort_values(by="score")
word_scores.head()
| | word | score |
|---|---|---|
284 | nymph | 310 |
2005 | inbox | 318 |
1511 | ethos | 326 |
2039 | affix | 340 |
1966 | umbra | 344 |
and the best words have common letters in common positions. Given the overwhelming frequency of words that start with "s" and end with "e" or "y," that's mostly what we get here.
word_scores.tail(20)
| | word | score |
|---|---|---|
528 | stale | 1336 |
1607 | saner | 1339 |
1741 | sense | 1342 |
1777 | cease | 1342 |
305 | slave | 1344 |
1421 | saucy | 1351 |
212 | shire | 1352 |
1478 | shone | 1360 |
2172 | soapy | 1366 |
481 | saint | 1371 |
1729 | crane | 1378 |
1704 | suite | 1381 |
2270 | shine | 1382 |
1530 | sooty | 1392 |
1004 | share | 1393 |
275 | saute | 1398 |
1543 | shale | 1403 |
1627 | slice | 1409 |
2032 | sauce | 1411 |
878 | slate | 1437 |
word_scores.hist('score', bins=30)
According to this method, the best first guess is slate.
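Since word_scores is sorted by ascending score, pulling that winner out programmatically is just the last row:

word_scores.iloc[-1]  # slate, with a score of 1437 (per the table above)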
Across all possible correct words, which word gives us the largest median decrease in the number of possible remaining words? The first method doesn't take into account the iterative nature of the game, and how partial "orange" matches tell you a given letter is in the word, but not in the position it was guessed in.
For this method, we make a 'fake' guess and return an array like the kind the game gives you:
0 == wrong
1 == present, but wrong position
2 == correct
in order to calculate the words that are still possible given the result of the first guess. Then we'll find the word that, across all possible correct words, eliminates the most possibilities.
First, we write our 'guess' function:
def guess(word:str, correct:str) -> np.ndarray:
    """
    return the status of each character in a guess, given a correct word.
    0 == wrong,
    1 == present, but wrong position
    2 == correct
    """
    assert len(word) == len(correct)
    ret = []
    for word_letter, correct_letter in zip(word, correct):
        if word_letter == correct_letter:
            ret.append(2)
        elif word_letter in correct:
            ret.append(1)
        else:
            ret.append(0)
    return np.array(ret)
guess('luger', 'query')
array([0, 2, 0, 1, 1])
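As one more quick check of the encoding (an extra example, not from the original post): 'slate' shares only the letter 'e' with 'query', and not in the right position:

guess('slate', 'query')
# -> array([0, 0, 0, 0, 1])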
From that, we write a function that gives us the remaining possible words, given a guessed word and a correct word.
We do this in four parts:

1. Use the guess function to figure out what the game would have told us about our guess. If all of our letters were wrong, return the full word list (since all words are still possible).
2. Use logical_and over the exactly matched positions to keep only those words that have all of the letters in the correct positions.
3. Use logical_and over the partial matches as well (each word should contain every partially matched letter).
4. Take the and of both of those masks, since possible words should have both the correct-position letters and the partial-match letters.

def possible_words(words:list[str], word:str, correct:str,
                   charray:np.ndarray=None, wordarray=None) -> list[str]:
    if charray is None:
        charray = make_charray(words)
    if wordarray is None:
        wordarray = np.array(words)
    # if we guess the correct word, there is only one possible word.
    if word == correct:
        return [word]
    guessed = guess(word, correct)
    # step 1: return wordlist if no matches
    if sum(guessed) == 0:
        return words
    # step 2: filter to words that have correct letter in correct position
    correct_mask = np.ones(charray.shape[0], dtype=bool)
    correct_idx = np.where(guessed == 2)[0]
    for idx in correct_idx:
        correct_mask = np.logical_and(
            correct_mask,
            charray[:,idx] == correct[idx]
        )
    # step 3: filter to words that contain any partial matches
    partial_letters = [word[i] for i in np.where(guessed == 1)[0]]
    partial_mask = np.ones(charray.shape[0], dtype=bool)
    for letter in partial_letters:
        partial_mask = np.logical_and(
            partial_mask,
            np.any(charray == letter, axis=1)
        )
    # step 3.5: remove the guessed word itself
    correct_mask[wordarray == word] = False
    # step 4: combine masks
    possible_mask = np.logical_and(correct_mask, partial_mask)
    possible_words = wordarray[possible_mask].tolist()
    return possible_words
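A minimal usage sketch: after guessing 'slate' against the answer 'query', the only constraint is the partially matched 'e', so the function returns every game word containing an 'e' (minus 'slate' itself).

remaining = possible_words(game, 'slate', 'query')
print(len(remaining))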
Now we need to wrap this function twice to compute it for all guesses x all correct words:

- entropy_reduction finds, for a given correct word, the number of words that remain possible after each possible guess.
- global_entropy then calculates the entropy_reduction for all correct words.

We use a multiprocessing pool to speed up the calculation of global_entropy because I didn't spend any time optimizing this code lmao
def entropy_reduction(correct:str, words:list[str]=game, return_words=False, pbar=False, reverse=False) -> pd.DataFrame:
    """
    get the number of possible words remaining after guessing every word,
    given a correct word

    Args:
        return_words (bool): If True, return an additional column `'possible_words'`
            that is a list of the words themselves
        pbar (bool): If True, use tqdm to show a progress bar
        reverse (bool): If True, swap treatment of 'guessed' and 'correct' words
            to get the remaining words if you guessed a given word against every possible
            correct word. (default False: calculate entropy reduced by all guesses for
            a given correct word)
    """
    charray = make_charray(words)
    wordarray = np.array(words)
    if pbar:
        _pbar = tqdm(total=len(words), position=0)
    guesses = []
    for word in words:
        if reverse:
            poss_words = possible_words(words, correct, word, charray, wordarray)
        else:
            poss_words = possible_words(words, word, correct, charray, wordarray)
        if return_words:
            guesses.append((word, correct, len(poss_words), poss_words))
        else:
            guesses.append((word, correct, len(poss_words)))
        if pbar:
            _pbar.update()
    if return_words:
        return pd.DataFrame(guesses, columns=["word", "correct", "possible", "possible_words"])
    else:
        return pd.DataFrame(guesses, columns=["word", "correct", "possible"])
def global_entropy(words:list[str], procs:int=12, chunksize:int=10) -> pd.DataFrame:
    """
    For all words, find the word that most reduces the entropy on the first play
    """
    pbar = tqdm(total=len(words), position=1)
    with mp.Pool(procs) as pool:
        results = pool.imap_unordered(
            wordle_helpers.entropy_reduction,
            words,
            chunksize=chunksize
        )
        dfs = []
        for res in results:
            dfs.append(res)
            pbar.update()
    return dfs
For one word, our entropy_reduction function gives us the best words to guess -- for example, given today's word 'query'...
query = entropy_reduction('query', game, pbar=True, return_words=True)
query.sort_values('possible').head()
| | word | correct | possible | possible_words |
|---|---|---|---|---|
205 | query | query | 1 | [query] |
698 | queer | query | 1 | [query] |
1094 | buyer | query | 1 | [query] |
860 | azure | query | 1 | [query] |
1359 | fiery | query | 3 | [query, leery, every] |
The entropy reduction can be a bit odd to parse -- I can imagine "queer" strongly constraining "query," missing just the y and having an r out of position, but "azure" is less intuitive. Azure rules out all words that contain "a" (or "z"), and indicates that "u," "r," and "e" are present, but not in those positions (and recall that "e"'s most common position is at the end of the word). Even though "fiery" shares more letters with "query," a couple of other words ("leery," "every") also end in "ery" without containing "f" or "i," so two additional words remain possible.
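We can reproduce one of those rows directly with the possible_words function from above -- per the table, guessing 'azure' when the answer is 'query' leaves only 'query' itself:

possible_words(game, 'azure', 'query')
# -> ['query'], matching the possible_words column above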
So while the entropy reduction approach might give you the most information in a technical sense, it's not always possible to use that information intuitively, since doing so implicitly depends on knowing the space of possible words.
Now we'll compute the entropy across all correct words. Since it takes a bit, I've saved the pickle and will redistribute it along with this post.
if Path('global_entropy.pck.xz').exists():
    ent = pd.read_pickle('global_entropy.pck.xz')
else:
    ent = global_entropy(game, procs=15, chunksize=20)
    ent = pd.concat(ent)
    ent.to_pickle('global_entropy.pck.xz')
ent
| | word | correct | possible |
|---|---|---|---|
0 | cigar | pilot | 201 |
1 | rebut | pilot | 252 |
2 | sissy | pilot | 201 |
3 | humph | pilot | 345 |
4 | awake | pilot | 2315 |
... | ... | ... | ... |
2310 | judge | hello | 1055 |
2311 | rower | hello | 208 |
2312 | artsy | hello | 2315 |
2313 | rural | hello | 647 |
2314 | shave | hello | 135 |
5359225 rows × 3 columns
This gives us a DataFrame with the number of possible words that remain after guessing each word, given the correct word.
ent.head()
| | word | correct | possible |
|---|---|---|---|
0 | cigar | pilot | 201 |
1 | rebut | pilot | 252 |
2 | sissy | pilot | 201 |
3 | humph | pilot | 345 |
4 | awake | pilot | 2315 |
Then we group by each guessed word and find the lowest median number of possible remaining words -- these are the words that give us the most information on a first guess.
med_ent = ent.groupby('word').median('possible').sort_values('possible')
med_ent.head()
| word | possible |
|---|---|
canoe | 208.0 |
stair | 211.0 |
rinse | 217.0 |
tenor | 217.0 |
siren | 217.0 |
med_ent.tail()
| word | possible |
|---|---|
mummy | 2315.0 |
fluff | 2315.0 |
mamma | 2315.0 |
bobby | 2315.0 |
pygmy | 2315.0 |
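To see why the worst words do so badly: possible can only equal 2315 when a guess shares no letters at all with the correct word, so a median of 2315 means that for at least half of the possible answers, a word like 'mummy' (only three distinct letters) tells you nothing. A quick sketch to check that fraction, using the reverse flag described in the next paragraph:

mummy = entropy_reduction('mummy', game, reverse=True)
# fraction of answers for which guessing 'mummy' leaves every word possible
print((mummy['possible'] == len(game)).mean())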
Weird! That's a completely different list. Let's take a look at why canoe is such a good word by examining the possible words that remain after a guess, using the reverse flag on entropy_reduction, which evaluates one guessed word against all possible correct words (rather than all guesses against a single correct word).
canoe = entropy_reduction('canoe', game,
return_words = True, reverse=True,
pbar=True)
canoe = canoe[canoe['possible']>1]
canoe.sort_values('possible').head(50)
| | word | correct | possible | possible_words |
|---|---|---|---|---|
1925 | bacon | canoe | 2 | [canon, bacon] |
669 | lance | canoe | 2 | [lance, dance] |
1514 | decor | canoe | 2 | [decor, decoy] |
1391 | carol | canoe | 2 | [canon, carol] |
699 | venom | canoe | 2 | [venom, tenor] |
354 | manor | canoe | 2 | [manor, canon] |
1771 | decoy | canoe | 2 | [decor, decoy] |
2085 | clone | canoe | 2 | [clone, crone] |
1828 | tenor | canoe | 2 | [venom, tenor] |
1581 | dance | canoe | 2 | [lance, dance] |
2243 | crone | canoe | 2 | [clone, crone] |
761 | clean | canoe | 2 | [clean, crane] |
25 | colon | canoe | 2 | [colon, canon] |
112 | alone | canoe | 3 | [alone, atone, anode] |
1317 | annoy | canoe | 3 | [manor, canon, annoy] |
1899 | anode | canoe | 3 | [alone, atone, anode] |
1936 | coven | canoe | 3 | [coven, clone, crone] |
369 | atone | canoe | 3 | [alone, atone, anode] |
703 | acorn | canoe | 4 | [acorn, ocean, canon, bacon] |
1268 | mange | canoe | 4 | [lance, range, mange, dance] |
593 | chaos | canoe | 4 | [chaos, canon, carol, cocoa] |
331 | canny | canoe | 4 | [canny, candy, canon, canal] |
1418 | scone | canoe | 4 | [scone, ounce, clone, crone] |
1238 | conch | canoe | 4 | [conic, canon, conch, condo] |
1555 | cocoa | canoe | 4 | [chaos, canon, carol, cocoa] |
810 | range | canoe | 4 | [lance, range, mange, dance] |
824 | scion | canoe | 4 | [colon, scion, canon, bacon] |
2285 | condo | canoe | 4 | [conic, canon, conch, condo] |
825 | candy | canoe | 4 | [canny, candy, canon, canal] |
175 | conic | canoe | 4 | [conic, canon, conch, condo] |
1531 | canal | canoe | 4 | [canny, candy, canon, canal] |
1304 | havoc | canoe | 4 | [canon, havoc, carol, bacon] |
611 | carve | canoe | 5 | [carve, caste, cause, cache, cable] |
1337 | cache | canoe | 5 | [carve, caste, cause, cache, cable] |
1084 | caste | canoe | 5 | [carve, caste, cause, cache, cable] |
1574 | mango | canoe | 5 | [manor, canon, mango, tango, banjo] |
373 | cacao | canoe | 5 | [cargo, cacao, canon, carol, cameo] |
948 | naive | canoe | 5 | [lance, range, naive, mange, dance] |
311 | cargo | canoe | 5 | [cargo, cacao, canon, carol, cameo] |
2109 | banjo | canoe | 5 | [manor, canon, mango, tango, banjo] |
1288 | cause | canoe | 5 | [carve, caste, cause, cache, cable] |
1926 | cable | canoe | 5 | [carve, caste, cause, cache, cable] |
2088 | tango | canoe | 5 | [manor, canon, mango, tango, banjo] |
1686 | cairn | canoe | 6 | [canny, candy, cabin, canon, canal, cairn] |
66 | panel | canoe | 6 | [panel, lance, range, mange, dance, saner] |
1607 | saner | canoe | 6 | [panel, lance, range, mange, dance, saner] |
979 | cabin | canoe | 6 | [canny, candy, cabin, canon, canal, cairn] |
1108 | enact | canoe | 7 | [lance, clean, ocean, enact, dance, crane, pecan] |
1786 | color | canoe | 7 | [colon, chaos, crook, canon, carol, cocoa, color] |
2082 | pecan | canoe | 7 | [lance, clean, ocean, enact, dance, crane, pecan] |
Just by eye, it looks like canoe is useful for indicating a number of words that have some combination of 'a', 'n', 'o', and 'e' somewhere in the word. Since it contains three of the most common vowels, it also tells you a lot about words that don't have one or more of them. The words that canoe doesn't reduce entropy for are unremarkable: they just don't share any letters with it. What seems to make canoe a good word is its ability to reduce uncertainty with a few partially overlapping criteria. Looking at the histogram of the number of words remaining after guessing canoe, there are 216 correct words for which no information is gained, but the next-highest bin already has just under half the possible words remaining, and most words leave fewer than a quarter of all words still possible.
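That 216 figure is easy to check directly from the character array (a quick sketch; it's the complement of the words_containing count at the end of the post):

# number of game words that share no letters with 'canoe'
print(np.sum(~np.any(np.isin(charray, list('canoe')), axis=1)))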
canoe.hist('possible', bins=30)
The words in that half reduction bin are words that share an 'e' and no other letter.
canoe[np.logical_and(canoe['possible']>1000, canoe['possible'] < 2000)].head(20)
| | word | correct | possible | possible_words |
|---|---|---|---|---|
1 | rebut | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
16 | quiet | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
22 | fresh | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
32 | helix | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
37 | whelp | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
53 | flesh | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
58 | belly | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
60 | seedy | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
74 | bleed | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
85 | greet | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
95 | islet | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
114 | hyper | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
121 | tweed | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
125 | steed | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
143 | fixer | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
154 | surer | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
168 | exult | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
169 | usher | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
191 | ferry | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
196 | rebus | canoe | 1055 | [rebut, awake, evade, serve, heath, model, gra... |
What about the other letters? A naive hypothesis might be that they're just the most frequent characters, but this isn't necessarily the case. To be maximally entropy-reducing, each letter needs to give independent information, and the most frequently used characters might be too general to draw distinctions with. What do I know, I'm no ~ scientist ~
First, we remind ourselves how frequent these characters are relative to all characters:
char_counts = np.unique(charray, return_counts=True)
# char_counts[1].shape
char_counts = np.column_stack(char_counts)
char_counts = pd.DataFrame(char_counts, columns=['letter', 'count'])
char_counts['count'] = pd.to_numeric(char_counts['count'])
char_counts[char_counts['letter'].isin(['c', 'a', 'n', 'o', 'e'])].sort_values('count')
| | letter | count |
|---|---|---|
2 | c | 477 |
13 | n | 575 |
14 | o | 754 |
0 | a | 979 |
4 | e | 1233 |
char_counts.hist('count', bins=15)
So 'c'
is interesting, because it's not an altogether common letter, but it is most common at the beginning of the word. So if a word has 'c'
it's likely in the first position and you'll get it right, and if it doesn't you get to rule out words rather than having to guess where else 'c'
goes. The vowels are a bit different since they happen in more places in the word more frequently, but being able to diagnose which vowels are present in the word and which are in the correct/incorrect positions gives you a lot of information.
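We can put a rough number on that intuition using the two frequency tables we already built (a small sketch; position_freqs and char_counts are defined above):

# what share of all 'c' occurrences are in the first position?
c_first = position_freqs.query("letter == 'c' and position == 0")['count'].iloc[0]
c_total = char_counts.loc[char_counts['letter'] == 'c', 'count'].iloc[0]
print(c_first / c_total)  # from the tables above: 198 / 477, about 0.42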
Our old friend 'slate'
is actually not far behind in entropy reduction
slate = entropy_reduction('slate', game,
return_words = True, reverse=True,
pbar=True)
slate = slate[slate['possible']>1]
slate['possible'].median()
276.0
slate.hist('possible', bins=30)
As a final measurement, we can see how many correct words have at least one letter from the best- and worst-performing words from our two prior measurements. Unsurprisingly, the best words give you at least some information on the first guess.
def words_containing(word:str, charray:np.ndarray=charray, do_print=False):
    """count how many words in charray contain at least one letter from word"""
    contain_bools = []
    for letter in word:
        # number of times each letter of `word` appears in each game word
        contain_bools.append(np.sum(charray==letter, axis=1))
    contain_word = np.column_stack(contain_bools)
    contain_word = np.sum(contain_word, axis=1)
    contain = np.sum(contain_word > 0)
    if do_print:
        percent = '{:.2f}%'.format(contain*100/charray.shape[0])
        print(f'words containing {word}: {percent} ({contain}/{charray.shape[0]})')
words_containing('canoe', do_print=True)
words containing canoe: 90.67% (2099/2315)
words_containing('slate', do_print=True)
words containing slate: 90.45% (2094/2315)
words_containing('vivid', do_print=True)
words containing vivid: 42.63% (987/2315)
words_containing('nymph', do_print=True)
words containing nymph: 65.49% (1516/2315)