Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for reading/writing utf-8 with v2 files #26

Merged
merged 2 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 45 additions & 30 deletions puz.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@
MASKSTRING = 'ICHEATED'

ENCODING = 'ISO-8859-1'
ENCODING_UTF8 = 'UTF-8'
ENCODING_ERRORS = 'strict' # raises an exception for bad chars; change to 'replace' for laxer handling

ACROSSDOWN = 'ACROSS&DOWN'
ACROSSDOWN = b'ACROSS&DOWN'

BLACKSQUARE = '.'
BLACKSQUARE2 = ':'
Expand Down Expand Up @@ -139,6 +141,7 @@ def __init__(self):
self.height = 0
self.version = b'1.3'
self.fileversion = b'1.3\0' # default
self.encoding = ENCODING
# these are bytes that might be unused
self.unk1 = b'\0' * 2
self.unk2 = b'\0' * 12
Expand All @@ -160,7 +163,7 @@ def load(self, data):
# advance to start - files may contain some data before the
# start of the puzzle use the ACROSS&DOWN magic string as a waypoint
# save the preamble for round-tripping
if not s.seek_to(ACROSSDOWN.encode(ENCODING), -2):
if not s.seek_to(ACROSSDOWN, -2):
raise PuzzleFormatError("Data does not appear to represent a "
"puzzle. Are you sure you didn't intend "
"to use read?")
Expand All @@ -184,8 +187,12 @@ def load(self, data):
self.solution_state = puzzle_data[12]

self.version = self.fileversion[:3]
self.solution = s.read(self.width * self.height).decode(ENCODING)
self.fill = s.read(self.width * self.height).decode(ENCODING)
# Once we have fileversion we can guess the encoding
self.encoding = ENCODING if self.version_tuple()[0] < 2 else ENCODING_UTF8
s.encoding = self.encoding

self.solution = s.read(self.width * self.height).decode(self.encoding)
self.fill = s.read(self.width * self.height).decode(self.encoding)

self.title = s.read_string()
self.author = s.read_string()
Expand Down Expand Up @@ -228,7 +235,7 @@ def save(self, filename):
f.write(puzzle_bytes)

def tobytes(self):
s = PuzzleBuffer()
s = PuzzleBuffer(encoding=self.encoding)
# commit any changes from helpers
for h in self.helpers.values():
if 'save' in dir(h):
Expand All @@ -238,14 +245,14 @@ def tobytes(self):
s.write(self.preamble)

s.pack(HEADER_FORMAT,
self.global_cksum(), ACROSSDOWN.encode(ENCODING),
self.global_cksum(), ACROSSDOWN,
self.header_cksum(), self.magic_cksum(),
self.fileversion, self.unk1, self.scrambled_cksum,
self.unk2, self.width, self.height,
len(self.clues), self.puzzletype, self.solution_state)

s.write(self.solution.encode(ENCODING))
s.write(self.fill.encode(ENCODING))
s.write(self.encode(self.solution))
s.write(self.encode(self.fill))

s.write_string(self.title)
s.write_string(self.author)
Expand Down Expand Up @@ -275,16 +282,22 @@ def tobytes(self):
# postscript is initialized, read, and stored as bytes. In case it is
# overwritten as a string, this try/except converts it back.
try:
postscript_bytes = self.postscript.encode(ENCODING)
postscript_bytes = self.encode(self.postscript)
except AttributeError:
postscript_bytes = self.postscript

s.write(postscript_bytes)

return s.tobytes()

def encode(self, s):
return s.encode(self.encoding, ENCODING_ERRORS)

def encode_zstring(self, s):
return self.encode(s) + b'\0'

def version_tuple(self):
return tuple(map(int, self.version.decode(ENCODING).split('.')))
return tuple(map(int, self.version.split(b'.')))

def has_rebus(self):
return self.rebus().has_rebus()
Expand Down Expand Up @@ -326,15 +339,16 @@ def lock_solution(self, key):
if not self.is_solution_locked():
# set the scrambled bit and cksum
self.scrambled_cksum = scrambled_cksum(self.solution, self.width, self.height,
ignore_chars=self.blacksquare())
ignore_chars=self.blacksquare(), encoding=self.encoding)
self.solution_state = SolutionState.Locked
scrambled = scramble_solution(self.solution, self.width, self.height, key,
ignore_chars=self.blacksquare())
self.solution = scrambled

def check_answers(self, fill):
if self.is_solution_locked():
scrambled = scrambled_cksum(fill, self.width, self.height, ignore_chars=self.blacksquare())
scrambled = scrambled_cksum(fill, self.width, self.height,
ignore_chars=self.blacksquare(), encoding=self.encoding)
return scrambled == self.scrambled_cksum
else:
return fill == self.solution
Expand All @@ -349,35 +363,35 @@ def text_cksum(self, cksum=0):
# null termination, followed by all non-empty clues without null
# termination, followed by notes (but only for version >= 1.3)
if self.title:
cksum = data_cksum(self.title.encode(ENCODING) + b'\0', cksum)
cksum = data_cksum(self.encode_zstring(self.title), cksum)
if self.author:
cksum = data_cksum(self.author.encode(ENCODING) + b'\0', cksum)
cksum = data_cksum(self.encode_zstring(self.author), cksum)
if self.copyright:
cksum = data_cksum(self.copyright.encode(ENCODING) + b'\0', cksum)
cksum = data_cksum(self.encode_zstring(self.copyright), cksum)

for clue in self.clues:
if clue:
cksum = data_cksum(clue.encode(ENCODING), cksum)
cksum = data_cksum(self.encode(clue), cksum)

# notes included in global cksum starting v1.3 of format
if self.version_tuple() >= (1, 3) and self.notes:
cksum = data_cksum(self.notes.encode(ENCODING) + b'\0', cksum)
cksum = data_cksum(self.encode_zstring(self.notes), cksum)

return cksum

def global_cksum(self):
cksum = self.header_cksum()
cksum = data_cksum(self.solution.encode(ENCODING), cksum)
cksum = data_cksum(self.fill.encode(ENCODING), cksum)
cksum = data_cksum(self.encode(self.solution), cksum)
cksum = data_cksum(self.encode(self.fill), cksum)
cksum = self.text_cksum(cksum)
# extensions do not seem to be included in global cksum
return cksum

def magic_cksum(self):
cksums = [
self.header_cksum(),
data_cksum(self.solution.encode(ENCODING)),
data_cksum(self.fill.encode(ENCODING)),
data_cksum(self.encode(self.solution)),
data_cksum(self.encode(self.fill)),
self.text_cksum()
]

Expand All @@ -399,8 +413,9 @@ class PuzzleBuffer:
wraps a data buffer ('' or []) and provides .puz-specific methods for
reading and writing data
"""
def __init__(self, data=None):
def __init__(self, data=None, encoding=ENCODING):
self.data = data or []
self.encoding = encoding
self.pos = 0

def can_read(self, n_bytes=1):
Expand All @@ -425,7 +440,7 @@ def read_string(self):
def read_until(self, c):
start = self.pos
self.seek_to(c, 1) # read past
return str(self.data[start:self.pos-1], ENCODING)
return str(self.data[start:self.pos-1], self.encoding)

def seek_to(self, s, offset=0):
try:
Expand All @@ -441,7 +456,7 @@ def write(self, s):

def write_string(self, s):
s = s or ''
self.data.append(s.encode(ENCODING) + b'\0')
self.data.append(s.encode(self.encoding, ENCODING_ERRORS) + b'\0')

def pack(self, struct_format, *values):
self.data.append(struct.pack(struct_format, *values))
Expand Down Expand Up @@ -534,9 +549,9 @@ def __init__(self, puzzle):
rebus_data = self.puzzle.extensions.get(Extensions.Rebus, b'')
self.table = parse_bytes(rebus_data)
r_sol_data = self.puzzle.extensions.get(Extensions.RebusSolutions, b'')
solutions_str = r_sol_data.decode(ENCODING)
solutions_str = r_sol_data.decode(puzzle.encoding)
fill_data = self.puzzle.extensions.get(Extensions.RebusFill, b'')
fill_str = fill_data.decode(ENCODING)
fill_str = fill_data.decode(puzzle.encoding)
self.solutions = dict(
(int(item[0]), item[1])
for item in parse_dict(solutions_str).items()
Expand Down Expand Up @@ -573,9 +588,9 @@ def save(self):
if self.has_rebus():
# commit changes back to puzzle.extensions
self.puzzle.extensions[Extensions.Rebus] = pack_bytes(self.table)
rebus_solutions = dict_to_string(self.solutions).encode(ENCODING)
rebus_solutions = self.puzzle.encode(dict_to_string(self.solutions))
self.puzzle.extensions[Extensions.RebusSolutions] = rebus_solutions
rebus_fill = dict_to_string(self.fill).encode(ENCODING)
rebus_fill = self.puzzle.encode(dict_to_string(self.fill))
self.puzzle.extensions[Extensions.RebusFill] = rebus_fill


Expand Down Expand Up @@ -669,9 +684,9 @@ def unscramble_string(s, key):
return s


def scrambled_cksum(scrambled, width, height, ignore_chars=BLACKSQUARE):
def scrambled_cksum(scrambled, width, height, ignore_chars=BLACKSQUARE, encoding=ENCODING):
data = replace_chars(square(scrambled, width, height), ignore_chars)
return data_cksum(data.encode(ENCODING))
return data_cksum(data.encode(encoding, ENCODING_ERRORS))


def key_digits(key):
Expand Down
Binary file added testfiles/unicode.puz
Binary file not shown.
17 changes: 17 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,23 @@ def test_v1_4(self):
p = puz.read('testfiles/nyt_v1_4.puz')
p.tobytes()

def test_v2_unicode(self):
p = puz.read('testfiles/unicode.puz')
# puzzle title contains emoji
self.assertEqual(p.title, u'\u2694\ufe0f')
self.assertEqual(p.encoding, 'UTF-8')
p.tobytes()

def test_v2_upgrade(self):
p = puz.read('testfiles/washpost.puz')
p.title = u'\u2694\ufe0f'
p.version = b'2.0'
p.fileversion = b'2.0\0'
p.encoding = puz.ENCODING_UTF8
data = p.tobytes()
p2 = puz.load(data)
self.assertEqual(p2.title, u'\u2694\ufe0f')

def test_save_empty_puzzle(self):
''' confirm an empty Puzzle() can be saved to a file '''
p = puz.Puzzle()
Expand Down