From 694ceb10ce382ed854dfbf3ed1ca7d2042ce89f8 Mon Sep 17 00:00:00 2001 From: Lilith <__lilith@discord> Date: Sun, 8 Sep 2024 13:54:42 +1000 Subject: [PATCH] BOM/Codec detection, Python PEP/Formatter pass, small amount of modernisation --- valve_keyvalues_python/keyvalues.py | 154 +++++++++++++++++++--------- 1 file changed, 108 insertions(+), 46 deletions(-) diff --git a/valve_keyvalues_python/keyvalues.py b/valve_keyvalues_python/keyvalues.py index 1cc1d4b..1091b0f 100644 --- a/valve_keyvalues_python/keyvalues.py +++ b/valve_keyvalues_python/keyvalues.py @@ -1,32 +1,48 @@ __author__ = "Jiri Novotny" __version__ = "1.0.0" + +# Source: https://github.com/gorgitko/valve-keyvalues-python + + class KeyValues(dict): """ - Class for manipulation with Valve KeyValue (KV) files (VDF format). Parses the KV file to object with dict interface. + Class for manipulation with Valve KeyValue (KV) files (VDF format). Parses the KV file to object with dict + interface. + Allows to write objects with dict interface to KV files. """ __re = __import__('re') __sys = __import__('sys') + __codecs = __import__('codecs') __OrderedDict = __import__('collections').OrderedDict __regexs = { "key": __re.compile(r"""(['"])(?P((?!\1).)*)\1(?!.)""", __re.I), "key_value": __re.compile(r"""(['"])(?P((?!\1).)*)\1(\s+|)['"](?P((?!\1).)*)\1""", __re.I) } - - def __init__(self, mapper=None, filename=None, encoding="utf-8", mapper_type=__OrderedDict, key_modifier=None, key_sorter=None): + + def __init__(self, mapper=None, filename=None, encoding="utf-8", mapper_type=__OrderedDict, key_modifier=None, + key_sorter=None, ignore_bom=False): """ :param mapper: initialize with own dict-like mapper - :param filename: filename of KV file, which will be parsed to dict structure. Mapper param must not be specified when using this param! + :param filename: filename of KV file, which will be parsed to dict structure. Mapper param must not be specified + when using this param! :param encoding: KV file encoding. Default: 'utf-8' - :param mapper_type: which mapper will be used for storing KV. It must have the dict interface, i.e. allow to do the 'mapper[key] = value action'. - default: 'collections.OrderedDict' - For example you can use the 'dict' type. - :param key_modifier: function for modifying the keys, e.g. the function 'string.lower' will make all the keys lower - :param key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function 'sorted' will show KV keys in alphabetical order + :param mapper_type: which mapper will be used for storing KV. It must have the dict interface, i.e. allow to do + the 'mapper[key] = value action'. + default: 'collections.OrderedDict' + For example you can use the 'dict' type. + :param key_modifier: function for modifying the keys, e.g. the function 'string.lower' will make all the keys + lower + :param key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function 'sorted' + will show KV keys in alphabetical order + :param ignore_bom: Manual override flag to ignore the BOM marker at the start of the file and use the specified + encoding. Normal use should never see the user modifying this flag. Does nothing if filename + is not specified. """ + super().__init__() self.__sys.setrecursionlimit(100000) self.mapper_type = type(mapper) if mapper else mapper_type self.key_modifier = key_modifier @@ -40,8 +56,8 @@ def __init__(self, mapper=None, filename=None, encoding="utf-8", mapper_type=__O self.__mapper = mapper return - if type(filename) == str: - self.parse(filename, encoding=encoding) + if isinstance(filename, str): + self.parse(filename, encoding=encoding, ignore_bom=ignore_bom) else: raise Exception("'filename' argument must be string!") @@ -52,7 +68,7 @@ def __getitem__(self, key): return self.__mapper[key] def __repr__(self): - #return repr(self.__mapper) + # return repr(self.__mapper) return self.dump(self.__mapper) def __len__(self): @@ -73,8 +89,9 @@ def copy(self): def has_key(self, k): return self.__mapper.has_key(k) - def pop(self, k, d=None): - return self.__mapper.pop(k, d) + # This function gets redef'd below and never used + # def pop(self, k, d=None): + # return self.__mapper.pop(k, d) def update(self, *args, **kwargs): return self.__mapper.update(*args, **kwargs) @@ -91,8 +108,9 @@ def items(self): def pop(self, *args): return self.__mapper.pop(*args) - def __cmp__(self, dict): - return cmp(self.__mapper, dict) + def __cmp__(self, _dict): + # cmp() does not exist in Py3, but we can just refer to the py3 dict __cmp__ implementation + return super().__cmp__(self.__mapper, _dict) def __contains__(self, item): return item in self.__mapper @@ -101,7 +119,8 @@ def __iter__(self): return iter(self.__mapper) def __unicode__(self): - return unicode(repr(self.__mapper)) + # str is the idiomatic replacement for Py2's 'unicode()' function + return str(repr(self.__mapper)) def __str__(self): return self.dump() @@ -141,20 +160,21 @@ def __parse(self, lines, mapper_type, i=0, key_modifier=None): if lines[i].startswith("{"): if not key: raise Exception("'{{' found without key at line {}".format(i + 1)) - _mapper[key], i = self.__parse(lines, i=i+1, mapper_type=mapper_type, key_modifier=key_modifier) + _mapper[key], i = self.__parse(lines, i=i + 1, mapper_type=mapper_type, key_modifier=key_modifier) continue elif lines[i].startswith("}"): return _mapper, i + 1 elif self.__re.match(self.__regexs["key"], lines[i]): - key = self.__key_modifier(self.__re.search(self.__regexs["key"], lines[i]).group("key"), key_modifier) + key = self.__key_modifier(self.__re.search(self.__regexs["key"], lines[i]).group("key"), + key_modifier) i += 1 continue elif self.__re.match(self.__regexs["key_value"], lines[i]): groups = self.__re.search(self.__regexs["key_value"], lines[i]) _mapper[self.__key_modifier(groups.group("key"), key_modifier)] = groups.group("value") i += 1 - elif self.__re.match(self.__regexs["key_value"], lines[i] + lines[i+1]): - groups = self.__re.search(self.__regexs["key_value"], lines[i] + " " + lines[i+1]) + elif self.__re.match(self.__regexs["key_value"], lines[i] + lines[i + 1]): + groups = self.__re.search(self.__regexs["key_value"], lines[i] + " " + lines[i + 1]) _mapper[self.__key_modifier(groups.group("key"), key_modifier)] = groups.group("value") i += 1 else: @@ -164,26 +184,66 @@ def __parse(self, lines, mapper_type, i=0, key_modifier=None): return _mapper - def parse(self, filename, encoding="utf-8", mapper_type=__OrderedDict, key_modifier=None): + @staticmethod + def _codec_detection(path, default) -> str: + """ + Reads the front of a file to see if a [Byte Order Mark (BOM)](https://en.wikipedia.org/wiki/Byte_order_mark) + exists. If so, it returns the codec that BOM usually corresponds to. A KV file with a BOM at the start that + is improperly stripped will fail to detect the first Key line as the line doesn't start with '"' or an ASCII + char. + + SOURCE: https://stackoverflow.com/a/24370596 + LICENSE: CC BY-SA 4.0 + AUTHOR: ivan_pozdeev + + :param path: The KV File path + :param default: The default chosen encoding (usually UTF8) + :return: A string of the encoding of the file (i.e. 'utf-8-sig', 'utf8', 'utf16', etc...) + """ + with open(path, 'rb') as f: + raw = f.read(4) # will read less if the file is smaller + # BOM_UTF32_LE's start is equal to BOM_UTF16_LE so need to try the former first + for enc, boms in \ + ('utf-8-sig', (KeyValues.__codecs.BOM_UTF8,)), \ + ('utf-32', (KeyValues.__codecs.BOM_UTF32_LE, KeyValues.__codecs.BOM_UTF32_BE)), \ + ('utf-16', (KeyValues.__codecs.BOM_UTF16_LE, KeyValues.__codecs.BOM_UTF16_BE)): + if any(raw.startswith(bom) for bom in boms): + return enc + return default + + def parse(self, filename, encoding="utf-8", mapper_type=__OrderedDict, key_modifier=None, ignore_bom=False): """ Parses the KV file so this instance can be accessed by dict interface. - :param filename: name of KV file - :param encoding: KV file encoding. Default: 'utf-8' - :param mapper_type: which mapper will be used for storing KV. It must have the dict interface, i.e. allow to do the 'mapper[key] = value action'. - default: 'collections.OrderedDict' - For example you can use the 'dict' type. - This will override the instance's 'mapper_type' if specified during instantiation. - :param key_modifier: function for modifying the keys, e.g. the function 'string.lower' will make all the keys lower. - This will override the instance's 'key_modifier' if specified during instantiation. + :param filename: name of the KV file + :param encoding: The encoding of the KV file. Default: 'utf-8'. Some KV files are UTF16-LE encoded. + :param mapper_type: which mapper will be used for storing KV. It must have the dict interface, + i.e. allow to do the 'mapper[key] = value action'. + default: 'collections.OrderedDict' + For example you can use the 'dict' type. This will override the instance's 'mapper_type' + if specified during instantiation. + :param key_modifier: function for modifying the keys, e.g. the function 'string.lower' will make all the keys + lower. This will override the instance's 'key_modifier' if specified during instantiation. + :param ignore_bom: Manual override flag to ignore the BOM marker at the start of the file and use the specified + encoding. Normal use should never see the user modifying this flag. """ - - with open(filename, mode="r", encoding=encoding) as f: - self.__mapper = self.__parse([line.strip() for line in f.readlines()], - mapper_type=mapper_type or self.mapper_type, - key_modifier=key_modifier or self.key_modifier) - - def __tab(self, string, level, quotes=False): + _encoding = encoding + _determined_codec = KeyValues._codec_detection(filename, _encoding) + if _determined_codec != _encoding: + print(f"Warning - The file codec was detected to be {_determined_codec.upper()} but {_encoding} was " + f"provided. Automatically using {_determined_codec.upper()} (pass ignore_bom=True to bypass this).") + if not ignore_bom: + _encoding = _determined_codec + + with open(filename, mode="r", encoding=_encoding) as f: + lines = [line.strip() for line in f.readlines()] + + self.__mapper = self.__parse(lines, + mapper_type=mapper_type or self.mapper_type, + key_modifier=key_modifier or self.key_modifier) + + @staticmethod + def __tab(string, level, quotes=False): if quotes: return '{}"{}"'.format(level * "\t", string) else: @@ -198,13 +258,13 @@ def __dump(self, mapper, key_sorter=None, level=0): keys = mapper.keys() for key in keys: - string += self.__tab(key, level, quotes=True) - if type(mapper[key]) == str: + string += KeyValues.__tab(key, level, quotes=True) + if isinstance(mapper[key], str): string += '\t "{}"\n'.format(mapper[key]) else: - string += "\n" + self.__tab("{\n", level) - string += self.__dump(mapper[key], key_sorter=key_sorter, level=level+1) - string += self.__tab("}\n", level) + string += "\n" + KeyValues.__tab("{\n", level) + string += self.__dump(mapper[key], key_sorter=key_sorter, level=level + 1) + string += KeyValues.__tab("}\n", level) return string @@ -213,8 +273,9 @@ def dump(self, mapper=None, key_sorter=None): Dumps the KeyValues mapper to string. :param mapper: you can dump your own object with dict interface - :param key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function 'sorted' will show KV in alphabetical order. - This will override the instance's 'key_sorter' if specified during instantiation. + :param key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function 'sorted' will + show KV in alphabetical order. This will override the instance's 'key_sorter' if specified + during instantiation. :return: string """ @@ -227,8 +288,9 @@ def write(self, filename, encoding="utf-8", mapper=None, key_sorter=None): :param filename: output KV file name :param encoding: output KV file encoding. Default: 'utf-8' :param mapper: you can write your own object with dict interface - :param key_sorter: key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function 'sorted' will show KV in alphabetical order. - This will override the instance's 'key_sorter' if specified during instantiation. + :param key_sorter: key_sorter: function for sorting the keys when dumping/writing/str, e.g. using the function + 'sorted' will show KV in alphabetical order. + This will override the instance's 'key_sorter' if specified during instantiation. """ with open(filename, mode="w", encoding=encoding) as f: