-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordle_utils.pyx
319 lines (261 loc) · 16 KB
/
wordle_utils.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#cython: language_level=3
"""Utility functions used by the other Wordle code.
Wordle is at https://www.powerlanguage.co.uk/wordle/.
This program comes with ABSOLUTELY NO WARRANTY. Use at your own risk. It is
copyright 2022 by Patrick Mooney. It is free software, and you are welcome to
redistribute it under certain conditions, according to the GNU general public
license, either version 3 or (at your own option) any later version. See the
file LICENSE.md for details.
"""
import collections
import heapq
import numbers
import re
import reprlib
import string
import typing
import unicodedata
from pathlib import Path
# Module-level setup: locate the word-list files, make sure the optional ones exist, and load them.
# NOTE: all of this runs at import time, and the main word list lives at a hard-coded absolute path.
# word_list_file = Path('/home/patrick/Documents/programming/resources/word-lists/wordle.list')
word_list_file = Path('/home/patrick/Documents/programming/resources/word-lists/dwyl/words_alpha.txt')
non_words_file = Path('non-words.txt')              # List of words from the large word list we want to exclude from consideration.
addl_words_file = Path('addl-words.txt')            # Words added manually, in addition to the main word list.
conf_words_file = Path('confirmed-words.txt')       # Words we have manually confirmed are definitely in the 'legal' set

# The three auxiliary lists are optional: create them empty (in the current directory) if they are missing, so
# that the reads below cannot fail on them.
if not non_words_file.exists():
    non_words_file.write_text("", encoding='utf-8')
if not addl_words_file.exists():
    addl_words_file.write_text("", encoding='utf-8')
if not conf_words_file.exists():
    conf_words_file.write_text("", encoding='utf-8')

# Each file holds one word per line; blank lines are dropped and every word is stripped and case-folded.
non_words = [w.strip().casefold() for w in non_words_file.read_text(encoding='utf-8').split('\n') if w.strip()]
addl_words = [w.strip().casefold() for w in addl_words_file.read_text(encoding='utf-8').split('\n') if w.strip()]
conf_words = [w.strip().casefold() for w in conf_words_file.read_text(encoding='utf-8').split('\n') if w.strip()]
main_english_words = [w.strip().casefold() for w in word_list_file.read_text(encoding='utf-8').split('\n') if w.strip()]

# Every five-letter word from the main and additional lists, minus the explicit exclusions. The exclusions are
# removed with a set difference so that each candidate is not checked against the NON_WORDS list linearly.
known_five_letter_words = {w for w in (main_english_words + addl_words) if len(w) == 5} - set(non_words)

word_list_text = '\n'.join(sorted(known_five_letter_words))     # these are the only words we're ever interested in
# Record type with one named field per piece of solution-tracking data. The module name must be given
# explicitly to make the namedtuple pickleable when this file is compiled:
# https://stackoverflow.com/questions/55224383/pickling-of-a-namedtuple-instance-succeeds-normally-but-fails-when-module-is-cy
SolutionData = collections.namedtuple(
    typename='SolutionData',
    field_names=[
        'Move',
        'InitialPossibleLetters',
        'KnownLettersWithUnknownPositionBeforeGuess',
        'PossibleLettersAfterGuess',
        'KnownLettersWithUnknownPositionAfterGuess',
        'RemainingPossibilities',
        'ExhaustedErroneously',
        'Solved',
    ],
    module=__name__,
)
# Some functions to help with normalization of text.
def strip_accents(s: str) -> str:
    """Return S with its accents removed.

    S is decomposed to Unicode normal form NFD, and every character whose Unicode
    category is 'Mn' (nonspacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def normalize_char_string(s: str) -> str:
    """Return a normalized version of S: lowercased, stripped of surrounding
    whitespace, reduced to its alphabetic characters only, and finally passed
    through strip_accents().
    """
    lowered = s.lower().strip()
    letters_only = ''.join(filter(str.isalpha, lowered))
    return strip_accents(letters_only)
# Helper functions for the Javier Frias functions, below.
def positional_population_counts(words: typing.Iterable[str]) -> typing.Dict[int, collections.Counter]:
    """Given WORDS, an iterable of five-letter words, return a dictionary mapping the
    position numbers 1 ... 5 to a Counter showing counts of letters in that
    position.

    For instance, given ['arose', 'until'], the result is (some not-necessarily-same-
    ordered version of):

        { 1: Counter({'a': 1, 'u': 1}),
          2: Counter({'r': 1, 'n': 1}),
          3: Counter({'o': 1, 't': 1}),
          4: Counter({'s': 1, 'i': 1}),
          5: Counter({'e': 1, 'l': 1}) }

    WORDS may be any iterable, including a one-shot iterator or generator. An empty
    WORDS produces five empty Counters. An AssertionError results if any word does
    not have exactly five letters.
    """
    # WORDS is walked once for validation and once per position, so take a snapshot first: a generator would
    # otherwise be used up by the validation pass and every Counter would silently come out empty.
    words = list(words)
    for word in words:
        assert len(word) == 5
    return {pos + 1: collections.Counter(word[pos] for word in words) for pos in range(5)}
# Two functions that help evaluate words according to criteria picked out by Javier Frias.
# See "FLOWS" at https://betterprogramming.pub/forget-luck-optimized-wordle-strategy-using-bigquery-c676771e316f
# As of this writing (19 May 2022), nothing yet uses these functions.
def positional_letter_weight(letter: str,
                             position_in_word: int,
                             sample_population: typing.Dict[int, collections.Counter] = positional_population_counts(known_five_letter_words),
                             ) -> int:
    """Return how often LETTER occurs as the POSITION_IN_WORDth letter (counting from
    1) among the words summarized in SAMPLE_POPULATION.

    SAMPLE_POPULATION maps position numbers to Counters of the letters found in that
    position, as produced by positional_population_counts(). Its default is
    computed once, when this function is defined, from all known five-letter words:
    Frias really only discusses this weight in relation to picking initial words,
    though it may be useful to help score later guesses, too.

    This is little more than a checked lookup into SAMPLE_POPULATION. LETTER must
    be a single lowercase ASCII letter and POSITION_IN_WORD an int from 1 to 5;
    anything else trips an assertion.
    """
    # Basic sanity checks on the arguments.
    assert len(letter) == 1
    assert letter in string.ascii_lowercase
    assert isinstance(position_in_word, int)
    assert 1 <= position_in_word <= 5

    counts_at_position = sample_population[position_in_word]
    return counts_at_position[letter]
def word_weight(word: str,
                sample_population: typing.Dict[int, collections.Counter] = positional_population_counts(known_five_letter_words),
                ) -> int:
    """The WORD_WEIGHT of a word, WORD, is the sum of the positional letter weights of
    each letter in the word in their respective positions, relative to the
    SAMPLE_POPULATION of words being considered. As with positional_letter_weight,
    above, it defaults to considering a population of "all Wordle words" because
    that's the context in which Frias gives it the most attention.

    WORD must consist of exactly five lowercase ASCII letters; anything else trips
    an assertion. Returns the computed sum.
    """
    assert len(word) == 5
    for c in word:
        assert c in string.ascii_lowercase
    # Positions are numbered from 1, matching the keys of SAMPLE_POPULATION.
    return sum(positional_letter_weight(c, position, sample_population) for position, c in enumerate(word, start=1))
# Functions that are used directly by other code to help manage the changing set of parameters as guesses are tried by
# various strategies.
def enumerate_solutions(possible: typing.Dict[int, str],
                        ambiguous_pos: str) -> typing.Tuple[collections.Counter, typing.Set[str]]:
    """Given POSSIBLE and AMBIGUOUS_POS, generate a list of words in the known words
    list that might be the word we're looking for, i.e. don't violate the currently
    known constraints.

    POSSIBLE is a dictionary mapping positions 1, 2, 3, 4, 5 to a collection (a
    string or a set) of the letters that might be in that position; its values are
    used in the dictionary's iteration order. AMBIGUOUS_POS is a string containing
    letters we know to be in the word but for which we haven't yet finalized the
    placement.

    Does not make any effort to rank the possible answers yet.

    Along the way, generates a Counter that maps letters in the words that might be
    the answer we're looking for to how often those letters occur in the possibility
    set.

    Returns a 2-tuple, in this order: the Counter mapping the letters in the
    maybe-words to their frequency in the maybe-words, and then a list of the
    maybe-words themselves, in the (sorted) order in which they appear in the
    module-level WORD_LIST_TEXT. If any position has no candidate letters left, no
    word can match, and the result is an empty Counter and an empty list.
    """
    letter_sets = [sorted(letters) for letters in possible.values()]
    if not all(letter_sets):
        # An empty character class is not a valid regex, and nothing could match it anyway.
        return collections.Counter(), []

    # One character class per position, anchored to a whole line of WORD_LIST_TEXT. Anchoring with ^ and $ (in
    # MULTILINE mode) rather than demanding a trailing newline matters: the text has no newline after its final
    # word, so a pattern ending in "\n" can never match that word.
    regex = '^' + ''.join(f"[{''.join(re.escape(c) for c in letters)}]" for letters in letter_sets) + '$'
    possible_answers = [ans.strip() for ans in re.findall(regex, word_list_text, flags=re.MULTILINE)
                        if all(c in ans for c in ambiguous_pos)]
    letter_frequencies = collections.Counter(''.join(possible_answers))
    return letter_frequencies, possible_answers
def untried_word_score(letter_frequencies: collections.Counter,
                       untried_letters: str,
                       w: str) -> int:
    """Score W, a word that has not yet been attempted, by how much new information
    guessing it is likely to reveal.

    Each distinct letter of W that appears in UNTRIED_LETTERS contributes its entry
    in LETTER_FREQUENCIES (a pre-computed count of how often each letter appears in
    the sample of five-letter words) to a running total. Letters that are not in
    UNTRIED_LETTERS contribute nothing: the score is designed to favor getting more
    information from the next guess, not to increase the likelihood of making the
    next guess the correct one. (We're trying to minimize the overall number of
    guesses, not to win on the next guess.) A repeated letter is counted only once,
    since its later occurrences don't elicit new information, either.

    "As many new letters as possible" is further incentivized by multiplying that
    total by the number of unique letters in W to derive the final score.
    """
    distinct_letters = dict.fromkeys(w)         # each letter once, in order of first appearance
    total = 0
    for letter in distinct_letters:
        if letter in untried_letters:
            total += letter_frequencies[letter]
    return total * len(distinct_letters)
def ranked_answers(possible_answers: typing.List[str],
                   letter_frequencies: collections.Counter,
                   untried_letters: str):
    """Return a new list holding the words in POSSIBLE_ANSWERS, ordered from highest-
    to lowest-scoring. Each word's score is computed by untried_word_score(), using
    the supplied LETTER_FREQUENCIES and UNTRIED_LETTERS. POSSIBLE_ANSWERS itself is
    left as it was.
    """
    return sorted(possible_answers,
                  key=lambda candidate: untried_word_score(letter_frequencies, untried_letters, candidate),
                  reverse=True)
def evaluate_guess(guess: str,
                   answer: str,
                   possibilities: typing.Dict[int, set],
                   unknown_pos: str) -> typing.Tuple[bool, str, typing.Dict[int, str]]:
    """Given a guess, evaluate whether that guess is correct, and update the known
    constraints based on the information that the guess reveals.

    GUESS is of course the current guess, and ANSWER is the answer. POSSIBILITIES is
    a dictionary mapping position number ( [1, 6) ) to a set of letters that might
    be in that position. UNKNOWN_POS is a string containing characters that are
    known to be in the answer, though we don't yet know where.

    POSSIBILITIES is immediately duplicated, including each of the sets of letters
    it contains, and the duplicate is modified and returned, so that neither the
    dictionary passed in nor any of its sets is ever modified.

    A letter of GUESS is treated as "yellow" whenever it is not in the right place
    but does occur somewhere in ANSWER, regardless of how many times it occurs.

    The return value is a 3-tuple: the first value is True iff GUESS is the correct
    ANSWER, or False otherwise. The second value is the revised string of characters
    that are known to be in the answer but whose position there is unknown, sorted
    and without duplicates. The third value is the revised POSSIBILITIES dict,
    mapping (1-based) position numbers onto a set of chars that might be in that
    position.

    An AssertionError results unless GUESS and ANSWER both have five letters.
    """
    # Copy the letter sets as well as the dict itself: a shallow dict() copy would share the sets with the
    # caller, and the in-place discards below would then alter the caller's data.
    revised_poss = {position: set(letters) for position, letters in possibilities.items()}
    assert len(guess) == len(answer) == 5, f"ERROR! Somehow, we're dealing with a word ({guess.upper()} / {answer.upper()}) that doesn't have 5 letters!"

    for pos, (guess_c, ans_c) in enumerate(zip(guess, answer)):
        if guess_c == ans_c:                # green tile: this is the correct letter for this position
            revised_poss[1+pos] = set(guess_c)
            unknown_pos = ''.join([c for c in unknown_pos if c != guess_c])
        elif guess_c in answer:             # yellow tile: letter appears in word, but not in this position
            unknown_pos += guess_c
            revised_poss[1+pos].discard(guess_c)
        else:                               # gray tile: letter does not appear in word at all
            for letters in revised_poss.values():
                letters.discard(guess_c)
            unknown_pos = ''.join([c for c in unknown_pos if c != guess_c])

    unknown_pos = ''.join(sorted(set(unknown_pos)))
    return ((guess == answer), unknown_pos, revised_poss)
class FairLimitedHeap(collections.abc.Iterable):
    """A heap implemented using the heapq module that has a limited size. The size
    limit is a "soft" or "fair" limit: it tries to keep SOFT_LIMIT items (if at
    least that many items have been added to the heap in the first place), but will
    keep more if the worst-scored items all score equally poorly. So, for instance,
    pushing a single new item onto a heap that has exactly SOFT_LIMIT items will not
    delete a single item from the bottom of the heap if multiple items at the bottom
    of the heap all have the same low score.

    For instance, if there is a heap of 100 items, and the soft limit is 100, and
    the four lowest-ranked items all have the same low score, then:

      * Pushing an item scoring above that low score will increase the size of the
        heap to 101, because none of the items with the low score has a score any
        lower than any of the others, so none of them is removed; and
      * pushing another item scoring above that low score will increase the size of
        the heap to 102, for that same reason; same with pushing another item with a
        score above that low score, which will increase the heap size to 103; and
      * pushing a fourth item above that low score will add the item to the heap,
        temporarily increasing the size of the heap to 104 before removing all four of
        the equally low-scoring items and taking the heap size down to 100.

    The score of items must be manually specified when adding an item to the heap.
    (Method signatures call the score VALUE.) Items added to the heap must be
    orderable for this to work. Internally, the heap is a list of (score, item)
    tuples, and relies on Python's short-circuiting of tuple comparisons to keep the
    heap sorted. (The fact that items with identical scores will be ordered based on
    the ordering of the items being stored both implies that the heap does not
    preserve insertion order for same-scored items and is the reason why items
    scored must be orderable.)

    Iterating over the heap yields the stored (score, item) tuples in internal heap
    order, which is NOT fully sorted; indexing the heap returns only the item (not
    the score) found at that position of the same internal order. Use
    .as_sorted_list() to get the contents in sorted order.
    """

    def __init__(self, soft_limit: int = 100,
                 initial: typing.Iterable = ()) -> None:
        """If INITIAL is specified, it must be an iterable of (value, item) tuples, where
        each value is the numeric score of the corresponding item. Any iterable will
        do, including a generator.
        """
        # INITIAL is walked twice (to validate, then to load), so take a snapshot of it first; otherwise a
        # one-shot iterator would be used up by the validation pass and nothing would be loaded.
        initial = list(initial)
        for score, _ in initial:
            assert isinstance(score, numbers.Number)
        self._soft_limit = soft_limit
        self._data = []
        for score, item in initial:
            self.push(item, score)

    def __str__(self) -> str:
        if not self._data:          # an empty heap has no priorities to report
            return f"< {self.__class__.__name__} object, with 0 items >"
        # Only the smallest entry has a guaranteed position (the front) in a heap; the largest must be searched for.
        lowest = self._data[0][0]
        highest = max(score for score, _ in self._data)
        return f"< {self.__class__.__name__} object, with {len(self._data)} items having priorities from {lowest} to {highest} >"

    def __repr__(self) -> str:
        """Limit how many items are displayed in the __repr__ by using reprlib.
        """
        return f"{self.__class__.__name__} ({reprlib.repr(self._data)})"

    def __eq__(self, other) -> bool:
        """Two heaps are equal iff they are of exactly the same type and hold the same
        internal list of (score, item) tuples.
        """
        if type(self) is not type(other):
            return False
        return self._data == other._data

    def __bool__(self) -> bool:
        return bool(self._data)

    def __iter__(self) -> typing.Iterator:
        return iter(self._data)

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, item) -> typing.Any:
        return self._data[item][1]          # return only the stored ITEM, not its SCORE.

    def __setitem__(self, key, newvalue):
        raise TypeError(f"Cannot directly set individual items in a {self.__class__.__name__}! Use .push() to add a value to the heap instead.")

    def push(self, item: typing.Any,
             value: numbers.Number) -> None:
        """Add ITEM to the heap with a score of VALUE. If that takes the heap over its
        soft limit, then every item sharing the lowest score in the heap is removed,
        but only if at least SOFT_LIMIT items would remain afterwards; otherwise
        nothing is removed.
        """
        heapq.heappush(self._data, (value, item))
        if len(self._data) <= self._soft_limit:
            return

        # Count the entries tied for the lowest score. The underlying list is only heap-ordered, not sorted, so
        # the tied entries need not be next to each other: every entry has to be examined.
        lowest_score = self._data[0][0]     # exact equality is good enough for comparing, even if difference is tiny.
        num_lowest = sum(1 for score, _ in self._data if score == lowest_score)

        if (len(self._data) - num_lowest) >= self._soft_limit:
            for _ in range(num_lowest):     # heappop always removes the smallest entry, so this removes exactly the tied ones.
                heapq.heappop(self._data)

    def pop(self, also_return_score: bool = False) -> typing.Any:
        """Remove and return the lowest-scored item in the heap. If ALSO_RETURN_SCORE
        is True (not the default), return a (score, item) tuple instead of just the
        item. Raises IndexError if the heap is empty.
        """
        score, item = heapq.heappop(self._data)
        if also_return_score:
            return (score, item)
        return item

    def as_sorted_list(self, include_values: bool = False) -> typing.List[typing.Tuple[numbers.Number, typing.Any]]:
        """Returns the contents of the heap as a new list, sorted in ascending order of
        score (with same-scored items ordered by the items themselves). If
        INCLUDE_VALUES is True (not the default), returns a list of (value, item)
        tuples instead of just the items.
        """
        ordered = sorted(self._data)
        if include_values:
            return ordered
        return [entry[1] for entry in ordered]