diff --git a/puz.py b/puz.py index 2f76c52..1278095 100644 --- a/puz.py +++ b/puz.py @@ -37,8 +37,10 @@ MASKSTRING = 'ICHEATED' ENCODING = 'ISO-8859-1' +ENCODING_UTF8 = 'UTF-8' +ENCODING_ERRORS = 'strict' # raises an exception for bad chars; change to 'replace' for laxer handling -ACROSSDOWN = 'ACROSS&DOWN' +ACROSSDOWN = b'ACROSS&DOWN' BLACKSQUARE = '.' BLACKSQUARE2 = ':' @@ -139,6 +141,7 @@ def __init__(self): self.height = 0 self.version = b'1.3' self.fileversion = b'1.3\0' # default + self.encoding = ENCODING # these are bytes that might be unused self.unk1 = b'\0' * 2 self.unk2 = b'\0' * 12 @@ -160,7 +163,7 @@ def load(self, data): # advance to start - files may contain some data before the # start of the puzzle use the ACROSS&DOWN magic string as a waypoint # save the preamble for round-tripping - if not s.seek_to(ACROSSDOWN.encode(ENCODING), -2): + if not s.seek_to(ACROSSDOWN, -2): raise PuzzleFormatError("Data does not appear to represent a " "puzzle. Are you sure you didn't intend " "to use read?") @@ -184,8 +187,12 @@ def load(self, data): self.solution_state = puzzle_data[12] self.version = self.fileversion[:3] - self.solution = s.read(self.width * self.height).decode(ENCODING) - self.fill = s.read(self.width * self.height).decode(ENCODING) + # Once we have fileversion we can guess the encoding + self.encoding = ENCODING if self.version_tuple()[0] < 2 else ENCODING_UTF8 + s.encoding = self.encoding + + self.solution = s.read(self.width * self.height).decode(self.encoding) + self.fill = s.read(self.width * self.height).decode(self.encoding) self.title = s.read_string() self.author = s.read_string() @@ -228,7 +235,7 @@ def save(self, filename): f.write(puzzle_bytes) def tobytes(self): - s = PuzzleBuffer() + s = PuzzleBuffer(encoding=self.encoding) # commit any changes from helpers for h in self.helpers.values(): if 'save' in dir(h): @@ -238,14 +245,14 @@ def tobytes(self): s.write(self.preamble) s.pack(HEADER_FORMAT, - self.global_cksum(), ACROSSDOWN.encode(ENCODING), + self.global_cksum(), ACROSSDOWN, self.header_cksum(), self.magic_cksum(), self.fileversion, self.unk1, self.scrambled_cksum, self.unk2, self.width, self.height, len(self.clues), self.puzzletype, self.solution_state) - s.write(self.solution.encode(ENCODING)) - s.write(self.fill.encode(ENCODING)) + s.write(self.encode(self.solution)) + s.write(self.encode(self.fill)) s.write_string(self.title) s.write_string(self.author) @@ -275,7 +282,7 @@ def tobytes(self): # postscript is initialized, read, and stored as bytes. In case it is # overwritten as a string, this try/except converts it back. try: - postscript_bytes = self.postscript.encode(ENCODING) + postscript_bytes = self.encode(self.postscript) except AttributeError: postscript_bytes = self.postscript @@ -283,8 +290,14 @@ def tobytes(self): return s.tobytes() + def encode(self, s): + return s.encode(self.encoding, ENCODING_ERRORS) + + def encode_zstring(self, s): + return self.encode(s) + b'\0' + def version_tuple(self): - return tuple(map(int, self.version.decode(ENCODING).split('.'))) + return tuple(map(int, self.version.split(b'.'))) def has_rebus(self): return self.rebus().has_rebus() @@ -326,7 +339,7 @@ def lock_solution(self, key): if not self.is_solution_locked(): # set the scrambled bit and cksum self.scrambled_cksum = scrambled_cksum(self.solution, self.width, self.height, - ignore_chars=self.blacksquare()) + ignore_chars=self.blacksquare(), encoding=self.encoding) self.solution_state = SolutionState.Locked scrambled = scramble_solution(self.solution, self.width, self.height, key, ignore_chars=self.blacksquare()) @@ -334,7 +347,8 @@ def lock_solution(self, key): def check_answers(self, fill): if self.is_solution_locked(): - scrambled = scrambled_cksum(fill, self.width, self.height, ignore_chars=self.blacksquare()) + scrambled = scrambled_cksum(fill, self.width, self.height, + ignore_chars=self.blacksquare(), encoding=self.encoding) return scrambled == self.scrambled_cksum else: return fill == self.solution @@ -349,26 +363,26 @@ def text_cksum(self, cksum=0): # null termination, followed by all non-empty clues without null # termination, followed by notes (but only for version >= 1.3) if self.title: - cksum = data_cksum(self.title.encode(ENCODING) + b'\0', cksum) + cksum = data_cksum(self.encode_zstring(self.title), cksum) if self.author: - cksum = data_cksum(self.author.encode(ENCODING) + b'\0', cksum) + cksum = data_cksum(self.encode_zstring(self.author), cksum) if self.copyright: - cksum = data_cksum(self.copyright.encode(ENCODING) + b'\0', cksum) + cksum = data_cksum(self.encode_zstring(self.copyright), cksum) for clue in self.clues: if clue: - cksum = data_cksum(clue.encode(ENCODING), cksum) + cksum = data_cksum(self.encode(clue), cksum) # notes included in global cksum starting v1.3 of format if self.version_tuple() >= (1, 3) and self.notes: - cksum = data_cksum(self.notes.encode(ENCODING) + b'\0', cksum) + cksum = data_cksum(self.encode_zstring(self.notes), cksum) return cksum def global_cksum(self): cksum = self.header_cksum() - cksum = data_cksum(self.solution.encode(ENCODING), cksum) - cksum = data_cksum(self.fill.encode(ENCODING), cksum) + cksum = data_cksum(self.encode(self.solution), cksum) + cksum = data_cksum(self.encode(self.fill), cksum) cksum = self.text_cksum(cksum) # extensions do not seem to be included in global cksum return cksum @@ -376,8 +390,8 @@ def global_cksum(self): def magic_cksum(self): cksums = [ self.header_cksum(), - data_cksum(self.solution.encode(ENCODING)), - data_cksum(self.fill.encode(ENCODING)), + data_cksum(self.encode(self.solution)), + data_cksum(self.encode(self.fill)), self.text_cksum() ] @@ -399,8 +413,9 @@ class PuzzleBuffer: wraps a data buffer ('' or []) and provides .puz-specific methods for reading and writing data """ - def __init__(self, data=None): + def __init__(self, data=None, encoding=ENCODING): self.data = data or [] + self.encoding = encoding self.pos = 0 def can_read(self, n_bytes=1): @@ -425,7 +440,7 @@ def read_string(self): def read_until(self, c): start = self.pos self.seek_to(c, 1) # read past - return str(self.data[start:self.pos-1], ENCODING) + return str(self.data[start:self.pos-1], self.encoding) def seek_to(self, s, offset=0): try: @@ -441,7 +456,7 @@ def write(self, s): def write_string(self, s): s = s or '' - self.data.append(s.encode(ENCODING) + b'\0') + self.data.append(s.encode(self.encoding, ENCODING_ERRORS) + b'\0') def pack(self, struct_format, *values): self.data.append(struct.pack(struct_format, *values)) @@ -534,9 +549,9 @@ def __init__(self, puzzle): rebus_data = self.puzzle.extensions.get(Extensions.Rebus, b'') self.table = parse_bytes(rebus_data) r_sol_data = self.puzzle.extensions.get(Extensions.RebusSolutions, b'') - solutions_str = r_sol_data.decode(ENCODING) + solutions_str = r_sol_data.decode(puzzle.encoding) fill_data = self.puzzle.extensions.get(Extensions.RebusFill, b'') - fill_str = fill_data.decode(ENCODING) + fill_str = fill_data.decode(puzzle.encoding) self.solutions = dict( (int(item[0]), item[1]) for item in parse_dict(solutions_str).items() @@ -573,9 +588,9 @@ def save(self): if self.has_rebus(): # commit changes back to puzzle.extensions self.puzzle.extensions[Extensions.Rebus] = pack_bytes(self.table) - rebus_solutions = dict_to_string(self.solutions).encode(ENCODING) + rebus_solutions = self.puzzle.encode(dict_to_string(self.solutions)) self.puzzle.extensions[Extensions.RebusSolutions] = rebus_solutions - rebus_fill = dict_to_string(self.fill).encode(ENCODING) + rebus_fill = self.puzzle.encode(dict_to_string(self.fill)) self.puzzle.extensions[Extensions.RebusFill] = rebus_fill @@ -669,9 +684,9 @@ def unscramble_string(s, key): return s -def scrambled_cksum(scrambled, width, height, ignore_chars=BLACKSQUARE): +def scrambled_cksum(scrambled, width, height, ignore_chars=BLACKSQUARE, encoding=ENCODING): data = replace_chars(square(scrambled, width, height), ignore_chars) - return data_cksum(data.encode(ENCODING)) + return data_cksum(data.encode(encoding, ENCODING_ERRORS)) def key_digits(key): diff --git a/testfiles/unicode.puz b/testfiles/unicode.puz new file mode 100644 index 0000000..e7b5c3f Binary files /dev/null and b/testfiles/unicode.puz differ diff --git a/tests.py b/tests.py index 16bf395..b8df0ff 100755 --- a/tests.py +++ b/tests.py @@ -84,6 +84,23 @@ def test_v1_4(self): p = puz.read('testfiles/nyt_v1_4.puz') p.tobytes() + def test_v2_unicode(self): + p = puz.read('testfiles/unicode.puz') + # puzzle title contains emoji + self.assertEqual(p.title, u'\u2694\ufe0f') + self.assertEqual(p.encoding, 'UTF-8') + p.tobytes() + + def test_v2_upgrade(self): + p = puz.read('testfiles/washpost.puz') + p.title = u'\u2694\ufe0f' + p.version = b'2.0' + p.fileversion = b'2.0\0' + p.encoding = puz.ENCODING_UTF8 + data = p.tobytes() + p2 = puz.load(data) + self.assertEqual(p2.title, u'\u2694\ufe0f') + def test_save_empty_puzzle(self): ''' confirm an empty Puzzle() can be saved to a file ''' p = puz.Puzzle()