What's the best first guess in wordle?

In [1]:
import json
import numpy as np
import pandas as pd
import hvplot.pandas  # noqa
from tqdm.notebook import tqdm
import multiprocessing as mp
from functools import reduce
from pathlib import Path

pd.options.plotting.backend = 'holoviews'

Due to some oddities in the way that notebooks handle multiprocessing (worker processes often can't pickle functions defined directly in the notebook), we will stash some of our functions in a separate helper file.

In [2]:
import wordle_helpers
from wordle_helpers import make_charray, guess, possible_words, entropy_reduction

The list of words is hardcoded in the wordle source, and I've saved them as a .json object where game is the list of words actually used as answers in the game and legal is the list of words that are legal to play. For the sake of all the analyses that follow we'll just consider the game words: even though the player doesn't know which words are possible correct words a priori, this reduces the number of computations we have to do with our janky blog-level code.

In [3]:
with open('wordle_words.json', 'r') as wfile:
    words = json.load(wfile)
    
game, legal = words['game'], words['legal']
In [4]:
print(game[0:5])
print(f'Game words: {len(game)}, guessable words: {len(legal)}')
['cigar', 'rebut', 'sissy', 'humph', 'awake']
Game words: 2315, guessable words: 10657

Letter Position Frequency

How many times does each letter occur in each position? One way to think about a best first word is in a "letterwise" way: given how frequently each letter appears in each position, which word maximizes the chance that each of its letters (considered independently) is correct?

First, make a function to split words into a 2-dimensional array of word x letter position.

In [5]:
def make_charray(words:list[str]) -> np.ndarray:
    """split words into an n x 5 character array"""
    word_chars = [[*word] for word in words]
    charray = np.array(word_chars)
    return charray
charray = make_charray(game)
In [6]:
charray[0:5,:]
Out[6]:
array([['c', 'i', 'g', 'a', 'r'],
       ['r', 'e', 'b', 'u', 't'],
       ['s', 'i', 's', 's', 'y'],
       ['h', 'u', 'm', 'p', 'h'],
       ['a', 'w', 'a', 'k', 'e']], dtype='<U1')

Then, we iterate through the 5 letter positions, using the numpy.unique function to count the number of times each unique letter appears. We make a tuple of (position, letter, count) for each character in each position, and combine that into a pandas DataFrame.

In [7]:
tups = []
for i, col in enumerate(charray.T):
    letters, counts = np.unique(col, return_counts=True)
    # make tuples of (position, letter, count)
    tups.extend([(i, letter, count) for letter, count in zip(letters, counts)])
    
print(tups[0:5])
[(0, 'a', 141), (0, 'b', 173), (0, 'c', 198), (0, 'd', 111), (0, 'e', 72)]
In [8]:
# recombine in a pandas dataframe
position_freqs = pd.DataFrame(tups, columns=["position", "letter", "count"])
position_freqs.head()
Out[8]:
position letter count
0 0 a 141
1 0 b 173
2 0 c 198
3 0 d 111
4 0 e 72

Now we can visualize the frequency of each letter in each position as a heatmap.

In [9]:
position_freqs.hvplot.heatmap(x='position', y='letter', C='count', 
                  height=500, width=300,flip_yaxis=True)
Out[9]:

Finally, for each word, we sum the number of times each of its letters appears in its position to get a score, then look at the lowest- and highest-scoring words.

In [10]:
word_scores = []
for word in game:
    score = 0
    for i, letter in enumerate(word):
        score += position_freqs[
            (position_freqs['position'] == i) & (position_freqs['letter'] == letter)
        ]['count'].values[0]
    word_scores.append((word, score))
    
word_scores = pd.DataFrame(word_scores, columns=["word", "score"])
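
As an aside, filtering the DataFrame once per letter is pretty slow. The same scores can be computed with a plain dictionary lookup -- a minimal sketch of an equivalent approach (the count_lookup and word_scores_fast names are just for illustration):

# build a {(position, letter): count} lookup from the frequency table
count_lookup = position_freqs.set_index(['position', 'letter'])['count'].to_dict()
# score each word by summing the per-position counts of its letters
word_scores_fast = pd.DataFrame(
    [(word, sum(count_lookup.get((i, letter), 0) for i, letter in enumerate(word)))
     for word in game],
    columns=["word", "score"],
)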

The worst words, predictably, have a bunch of rare letters in rare positions -- e.g. "nymph" (but remember this analysis doesn't consider letter combinations specifically).

In [11]:
word_scores = word_scores.sort_values(by="score")
word_scores.head()
Out[11]:
word score
284 nymph 310
2005 inbox 318
1511 ethos 326
2039 affix 340
1966 umbra 344

and the best words have common letters in common positions. Given the overwhelming frequency of words that start with "s" and end with "e" or "y," that's mostly what we get here.

In [12]:
word_scores.tail(20)
Out[12]:
word score
528 stale 1336
1607 saner 1339
1741 sense 1342
1777 cease 1342
305 slave 1344
1421 saucy 1351
212 shire 1352
1478 shone 1360
2172 soapy 1366
481 saint 1371
1729 crane 1378
1704 suite 1381
2270 shine 1382
1530 sooty 1392
1004 share 1393
275 saute 1398
1543 shale 1403
1627 slice 1409
2032 sauce 1411
878 slate 1437
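
As a quick check on the "starts with s, ends with e or y" pattern, we can count how many game words actually fit it (a small sketch, using the charray from above):

# how many game words start with 's', and how many end with 'e' or 'y'
starts_with_s = (charray[:, 0] == 's').sum()
ends_with_e_or_y = np.isin(charray[:, -1], ['e', 'y']).sum()
print(starts_with_s, ends_with_e_or_y)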
In [13]:
word_scores.hist('score', bins=30)
WARNING:param.main: column option not found for hist plot; similar options include: []
Out[13]:

According to this method, the best first guess is slate.

Most entropy-reducing first guess

Across all possible correct words, which word gives us the largest median decrease in the number of possible remaining words? The first method doesn't take into account the iterative nature of the game, and how partial "yellow" matches tell you a given letter is in the word, but not in the position it was guessed in.

For this method, we make a 'fake' guess and return an array like the kind the game gives you:

  • 0 == wrong
  • 1 == present, but wrong position
  • 2 == correct

in order to calculate the words that are still possible given the answer to the first guess. Then we'll find the word that reduces the most possible remaining words across all possible correct words.

First, we write our 'guess' function:

In [14]:
def guess(word:str, correct:str) -> np.ndarray:
    """
    return the status of each character in a guess, given a correct word.
    0 == wrong,
    1 == present, but wrong position
    2 == correct
    """
    assert len(word) == len(correct)
    ret = []
    for word_letter, correct_letter in zip(word, correct):
        if word_letter == correct_letter:
            ret.append(2)
        elif word_letter in correct:
            ret.append(1)
        else:
            ret.append(0)
    return np.array(ret)
In [15]:
guess('luger', 'query')
Out[15]:
array([0, 2, 0, 1, 1])

From that, we write a function that gives us the remaining possible words, given a guessed word and a correct word.

We do this in four parts:

  1. We first use our guess function to figure out what the game would have told us about our guess. If all of our letters were wrong, we return the full list of words (our filters below only use correct and partial matches, so an all-wrong guess doesn't rule anything out here)
  2. Then we filter based on the letters that were correct by using the array representation of the word list: we find the positions that were guessed correctly and keep only the words that have the same letter in each of those positions. In the case that a guess gives you multiple correct letters, we use logical_and so that only words with all of the correct letters in the correct positions survive.
  3. Then, if we have any partial matches, we check whether any letter in a word matches each partial match and take the logical_and of those as well (each word should contain every partially matched letter)
  4. Take the logical_and of both of those masks, since possible words should have both the correct letters and the partial-match letters
In [16]:
def possible_words(words:list[str], word:str, correct:str,
                  charray:np.ndarray=None, wordarray=None) -> list[str]:
    if charray is None:
        charray = make_charray(words)
    if wordarray is None:
        wordarray     = np.array(words)
        
    # if we guess the correct word, there is only one possible word.
    if word == correct:
        return [word]
    
    guessed       = guess(word, correct)
    
    # step 1: return wordlist if no matches
    if sum(guessed) == 0:
        return words
    
    
    # step 2: filter to words that have correct letter in correct position
    correct_mask = np.ones(charray.shape[0], dtype=bool)
    correct_idx = np.where(guessed == 2)[0]
    for idx in correct_idx:
        correct_mask = np.logical_and(
            correct_mask,
            charray[:,idx] == correct[idx]
        )
        
    # step 3: filter to words that contain any partial matches
    partial_letters = [word[i] for i in np.where(guessed == 1)[0]]
    partial_mask = np.ones(charray.shape[0], dtype=bool)
    for letter in partial_letters:
        partial_mask = np.logical_and(
            partial_mask,
            np.any(charray == letter, axis=1)
        )
        
    # step 3.5 remove the guessed word itself
    correct_mask[wordarray == word] = False
        
    # step 4: combine masks
    possible_mask = np.logical_and(correct_mask, partial_mask)
    possible_words = wordarray[possible_mask].tolist()
        
    return possible_words
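
Before wrapping it, a quick sanity check: guessing 'fiery' when the correct word is 'query' pins down the 'ery' ending and leaves three candidates (this matches the row for 'fiery' in the results further below).

possible_words(game, 'fiery', 'query')
# ['query', 'leery', 'every'] -- everything ending in 'ery', minus 'fiery' itself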

Now we need to wrap this function twice to compute it for all guesses x all correct words:

  1. entropy_reduction finds, for a given correct word, the number of words that remain possible after each possible guess
  2. global_entropy then calculates the entropy_reduction for all correct words.

We use a multiprocessing pool to speed up the calculation of global_entropy because i didn't spend any time optimizing this code lmao

In [17]:
def entropy_reduction(correct:str, words:list[str]=game, return_words=False, pbar=False, reverse=False) -> pd.DataFrame:
    """
    get the number of possible words remaining after guessing every word, 
    given a correct word
    
    Args:
        return_words (bool): If True, return an additional column `'possible_words'` 
            that is a list of the words themselves
        pbar (bool): If True, use tqdm to show a progress bar
        reverse (bool): if True, swap treatment of 'guessed' and 'correct' words
            to get the remaining words if you guessed a given word against every possible
            correct word. (default False: calculate entropy reduced by all guesses for
            a given correct word)
    """
    charray = make_charray(words)
    wordarray = np.array(words)
    if pbar:
        _pbar = tqdm(total=len(words), position=0)

    guesses = []
    for word in words:
        if reverse:
            poss_words = possible_words(words, correct, word, charray, wordarray)
        else:
            poss_words = possible_words(words, word, correct, charray, wordarray)
            
        if return_words:
            guesses.append((word, correct, len(poss_words), poss_words))
        else:
            guesses.append((word, correct, len(poss_words)))
            
        if pbar:
            _pbar.update()
    
    if return_words:
        return pd.DataFrame(guesses, columns=["word", "correct", "possible", "possible_words"])
    else:
        return pd.DataFrame(guesses, columns=["word", "correct", "possible"])
    
def global_entropy(words:list[str], procs:int=12, chunksize:int=10) -> pd.DataFrame:
    """
    Run entropy_reduction against every possible correct word in parallel, returning a list of DataFrames
    """
    
    pbar = tqdm(total=len(words), position=1)
    with mp.Pool(procs) as pool:
        results = pool.imap_unordered(
            wordle_helpers.entropy_reduction, 
            words, 
            chunksize=chunksize
        )
        dfs = []
        for res in results:
            dfs.append(res)
            pbar.update()
    return dfs   

For one word, our entropy_reduction function gives us the best words to guess -- for example, given today's word 'query'...

In [18]:
query = entropy_reduction('query', game, pbar=True, return_words=True)
query.sort_values('possible').head()
  0%|          | 0/2315 [00:00<?, ?it/s]
Out[18]:
word correct possible possible_words
205 query query 1 [query]
698 queer query 1 [query]
1094 buyer query 1 [query]
860 azure query 1 [query]
1359 fiery query 3 [query, leery, every]

The entropy reduction can be a bit odd to parse -- I can imagine "queer" strongly constraining "query," missing just the y and having its trailing e and r out of position, but "azure" is less intuitive. Azure pins the "r" in the fourth position and indicates that "u" and "e" are present but not where they were guessed (and recall that "e"'s most common position is at the end of words) -- that's enough to leave "query" as the only candidate. Even though "fiery" has more shared letters with "query," fixing the "ery" ending still leaves "leery" and "every" as candidates, so there are two additional available words.
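
For reference, here is what the guess function defined above reports for each of those guesses against 'query' (the arrays follow directly from that function):

guess('queer', 'query')  # array([2, 2, 2, 1, 1])
guess('azure', 'query')  # array([0, 0, 1, 2, 1])
guess('fiery', 'query')  # array([0, 0, 2, 2, 2])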

So while the entropy reduction approach might give you the most information in a technical sense, it's not always possible to use that information intuitively, since it implicitly depends on knowing the space of possible words.

Now we'll compute the entropy across all correct words. Since it takes a bit, I've saved the pickle and will redistribute it along with this post.

In [19]:
if Path('global_entropy.pck.xz').exists():
    ent = pd.read_pickle('global_entropy.pck.xz')
else:
    ent = global_entropy(game, procs=15, chunksize=20)
    ent = pd.concat(ent)
    ent.to_pickle('global_entropy.pck.xz')
In [20]:
ent
Out[20]:
word correct possible
0 cigar pilot 201
1 rebut pilot 252
2 sissy pilot 201
3 humph pilot 345
4 awake pilot 2315
... ... ... ...
2310 judge hello 1055
2311 rower hello 208
2312 artsy hello 2315
2313 rural hello 647
2314 shave hello 135

5359225 rows × 3 columns

This gives us a DataFrame with the number of possible words that remain after guessing each word, given the correct word.

In [21]:
ent.head()
Out[21]:
word correct possible
0 cigar pilot 201
1 rebut pilot 252
2 sissy pilot 201
3 humph pilot 345
4 awake pilot 2315

Then we group by each guessed word and take the median number of possible words remaining -- the words with the lowest medians give us the most information on a first guess.

In [22]:
med_ent = ent.groupby('word')[['possible']].median().sort_values('possible')
med_ent.head()
Out[22]:
possible
word
canoe 208.0
stair 211.0
rinse 217.0
tenor 217.0
siren 217.0
In [23]:
med_ent.tail()
Out[23]:
possible
word
mummy 2315.0
fluff 2315.0
mamma 2315.0
bobby 2315.0
pygmy 2315.0

Weird! That's a completely different list. Let's look at why canoe is such a good word by examining the possible words that remain after guessing it, using the reverse flag on entropy_reduction, which guesses one word against all possible correct words (rather than guessing all words against a single correct word).

In [24]:
canoe = entropy_reduction('canoe', game, 
                          return_words = True, reverse=True,
                          pbar=True)
canoe = canoe[canoe['possible']>1]
  0%|          | 0/2315 [00:00<?, ?it/s]
In [25]:
canoe.sort_values('possible').head(50)
Out[25]:
word correct possible possible_words
1925 bacon canoe 2 [canon, bacon]
669 lance canoe 2 [lance, dance]
1514 decor canoe 2 [decor, decoy]
1391 carol canoe 2 [canon, carol]
699 venom canoe 2 [venom, tenor]
354 manor canoe 2 [manor, canon]
1771 decoy canoe 2 [decor, decoy]
2085 clone canoe 2 [clone, crone]
1828 tenor canoe 2 [venom, tenor]
1581 dance canoe 2 [lance, dance]
2243 crone canoe 2 [clone, crone]
761 clean canoe 2 [clean, crane]
25 colon canoe 2 [colon, canon]
112 alone canoe 3 [alone, atone, anode]
1317 annoy canoe 3 [manor, canon, annoy]
1899 anode canoe 3 [alone, atone, anode]
1936 coven canoe 3 [coven, clone, crone]
369 atone canoe 3 [alone, atone, anode]
703 acorn canoe 4 [acorn, ocean, canon, bacon]
1268 mange canoe 4 [lance, range, mange, dance]
593 chaos canoe 4 [chaos, canon, carol, cocoa]
331 canny canoe 4 [canny, candy, canon, canal]
1418 scone canoe 4 [scone, ounce, clone, crone]
1238 conch canoe 4 [conic, canon, conch, condo]
1555 cocoa canoe 4 [chaos, canon, carol, cocoa]
810 range canoe 4 [lance, range, mange, dance]
824 scion canoe 4 [colon, scion, canon, bacon]
2285 condo canoe 4 [conic, canon, conch, condo]
825 candy canoe 4 [canny, candy, canon, canal]
175 conic canoe 4 [conic, canon, conch, condo]
1531 canal canoe 4 [canny, candy, canon, canal]
1304 havoc canoe 4 [canon, havoc, carol, bacon]
611 carve canoe 5 [carve, caste, cause, cache, cable]
1337 cache canoe 5 [carve, caste, cause, cache, cable]
1084 caste canoe 5 [carve, caste, cause, cache, cable]
1574 mango canoe 5 [manor, canon, mango, tango, banjo]
373 cacao canoe 5 [cargo, cacao, canon, carol, cameo]
948 naive canoe 5 [lance, range, naive, mange, dance]
311 cargo canoe 5 [cargo, cacao, canon, carol, cameo]
2109 banjo canoe 5 [manor, canon, mango, tango, banjo]
1288 cause canoe 5 [carve, caste, cause, cache, cable]
1926 cable canoe 5 [carve, caste, cause, cache, cable]
2088 tango canoe 5 [manor, canon, mango, tango, banjo]
1686 cairn canoe 6 [canny, candy, cabin, canon, canal, cairn]
66 panel canoe 6 [panel, lance, range, mange, dance, saner]
1607 saner canoe 6 [panel, lance, range, mange, dance, saner]
979 cabin canoe 6 [canny, candy, cabin, canon, canal, cairn]
1108 enact canoe 7 [lance, clean, ocean, enact, dance, crane, pecan]
1786 color canoe 7 [colon, chaos, crook, canon, carol, cocoa, color]
2082 pecan canoe 7 [lance, clean, ocean, enact, dance, crane, pecan]

Just by eye, it looks like canoe is useful for indicating a number of words that have some combination of 'a', 'n', 'o', and 'e' somewhere in the word. Since it's a word with 3 of the most common vowels, it also tells you a lot about words that don't have one or more of them. The words that canoe doesn't reduce entropy for are unremarkable: they don't have any shared letters. What seems to make canoe a good word is its ability to reduce uncertainty with a few partially overlapping criteria. Looking at the histogram of the number of words remaining after guessing canoe, there are 216 correct words for which no information is gained, but the next highest bin has just under half the possible words remaining. Most words have fewer than a quarter of all words still possible.
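
Those numbers can be pulled straight from the canoe DataFrame rather than read off the histogram -- a quick sketch (the variable names are just for illustration):

# correct words for which guessing 'canoe' rules nothing out
no_info = (canoe['possible'] == len(game)).sum()
# correct words for which fewer than a quarter of the game words remain possible
under_quarter = (canoe['possible'] < len(game) / 4).sum()
print(no_info, under_quarter)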

In [26]:
canoe.hist('possible', bins=30)
WARNING:param.main: column option not found for hist plot; similar options include: []
Out[26]:

The words in that half-reduction bin are words that share an 'e' with canoe and no other letter.

In [27]:
canoe[np.logical_and(canoe['possible']>1000, canoe['possible'] < 2000)].head(20)
Out[27]:
word correct possible possible_words
1 rebut canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
16 quiet canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
22 fresh canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
32 helix canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
37 whelp canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
53 flesh canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
58 belly canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
60 seedy canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
74 bleed canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
85 greet canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
95 islet canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
114 hyper canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
121 tweed canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
125 steed canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
143 fixer canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
154 surer canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
168 exult canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
169 usher canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
191 ferry canoe 1055 [rebut, awake, evade, serve, heath, model, gra...
196 rebus canoe 1055 [rebut, awake, evade, serve, heath, model, gra...

What about the other letters? A naive hypothesis might be that they're just the most frequent characters, but this isn't necessarily the case. To be maximally entropy-reducing, they each need to give independent information, and the most frequently used characters might be too general to draw distinctions with. What do i know i'm no ~ scientist ~

First, we remind ourselves how frequent canoe's characters are compared to the rest of the alphabet.

In [28]:
# count how many times each letter appears across all positions
char_counts = np.unique(charray, return_counts=True)
char_counts = np.column_stack(char_counts)
char_counts = pd.DataFrame(char_counts, columns=['letter', 'count'])
char_counts['count'] = pd.to_numeric(char_counts['count'])
In [29]:
char_counts[char_counts['letter'].isin(['c', 'a', 'n', 'o', 'e'])].sort_values('count')
Out[29]:
letter count
2 c 477
13 n 575
14 o 754
0 a 979
4 e 1233
In [30]:
char_counts.hist('count', bins=15)
WARNING:param.main: column option not found for hist plot; similar options include: []
Out[30]:

So 'c' is interesting: it's not an altogether common letter, but when it does appear it's most often at the beginning of the word. So if a word has a 'c' it's most likely in the first position and you'll get it right, and if it doesn't you get to rule out words rather than having to guess where else the 'c' goes. The vowels are a bit different since they appear in more positions and more frequently, but being able to diagnose which vowels are present in the word and which are in the correct/incorrect positions gives you a lot of information.
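
We can check that directly from the counts above: 'c' appears 477 times in total, and 198 of those are at the start of a word. The snippet below recomputes those two numbers from charray (variable names are just for illustration):

# how often 'c' lands in the first position, relative to all of its occurrences
c_first = (charray[:, 0] == 'c').sum()
c_total = (charray == 'c').sum()
print(c_first, c_total, round(c_first / c_total, 2))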

Our old friend 'slate' is actually not far behind in entropy reduction.

In [31]:
slate = entropy_reduction('slate', game, 
                          return_words = True, reverse=True,
                          pbar=True)
slate = slate[slate['possible']>1]
  0%|          | 0/2315 [00:00<?, ?it/s]
In [32]:
slate['possible'].median()
Out[32]:
276.0
In [33]:
slate.hist('possible', bins=30)
WARNING:param.main: column option not found for hist plot; similar options include: []
Out[33]:

As a final measurement, we can see how many correct words have at least one letter in common with the best- and worst-performing words from our two prior measurements. Unsurprisingly, the best words give you at least some information on the first guess for the vast majority of possible correct words.

In [34]:
def words_containing(word:str, charray:np.ndarray=charray, do_print=False) -> int:
    """count the words that contain at least one letter from `word`"""
    # for each letter of the guess, count its occurrences in every word
    letter_counts = []
    for letter in word:
        letter_counts.append(np.sum(charray == letter, axis=1))
    # a word counts as "containing" the guess if any of its letters appears at least once
    contain_word = np.sum(np.column_stack(letter_counts), axis=1)
    contain = np.sum(contain_word > 0)
    if do_print:
        percent = '{:.2f}%'.format(contain*100/charray.shape[0])
        print(f'words containing {word}: {percent} ({contain}/{charray.shape[0]})')
    return contain
In [35]:
words_containing('canoe', do_print=True)
words containing canoe: 90.67% (2099/2315)
In [36]:
words_containing('slate', do_print=True)
words containing slate: 90.45% (2094/2315)
In [37]:
words_containing('vivid', do_print=True)
words containing vivid: 42.63% (987/2315)
In [38]:
words_containing('nymph', do_print=True)
words containing nymph: 65.49% (1516/2315)