From dea9526835fdf4aa90ed341c0407a6576f2f1ce1 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Thu, 14 Nov 2013 10:01:14 -0500 Subject: [PATCH 01/41] Added --size argument to filter keys by size --- rdbtools/cli/rdb.py | 6 ++++-- rdbtools/memprofiler.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 348102f..6995ccc 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -22,6 +22,8 @@ def main(): parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. If not specified, all data types will be returned""") + parser.add_option("-s", "--size", dest="size", default=None, + help="Limit memory output to keys greater to or equal to this value (in bytes)") (options, args) = parser.parse_args() @@ -57,7 +59,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(f) elif 'memory' == options.command: - reporter = PrintAllKeys(f) + reporter = PrintAllKeys(f, options.size) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(f) @@ -71,7 +73,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(sys.stdout) elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout) + reporter = PrintAllKeys(sys.stdout, options.size) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(sys.stdout) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 3a80ba2..73aeac8 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -68,14 +68,16 @@ def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) class PrintAllKeys(): - def __init__(self, out): + def __init__(self, out, size): + self._size = int(size) self._out = out 
self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element")) def next_record(self, record) : - self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), - record.bytes, record.encoding, record.size, record.len_largest_element)) + if self._size is None or record.size >= self._size: + self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), + record.bytes, record.encoding, record.size, record.len_largest_element)) class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM From a2db5d28f56c00bb200ffdc630fa8dc5d8bb3c36 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Thu, 14 Nov 2013 22:04:24 -0500 Subject: [PATCH 02/41] Fixed bug when no --size is present Was attempting to apply int() to a None value. --- rdbtools/memprofiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 73aeac8..2207517 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -69,13 +69,13 @@ def get_json(self): class PrintAllKeys(): def __init__(self, out, size): - self._size = int(size) + self._size = size self._out = out self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element")) def next_record(self, record) : - if self._size is None or record.size >= self._size: + if self._size is None or record.size >= int(self._size): self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), record.bytes, record.encoding, record.size, record.len_largest_element)) From b2e2d5e88f7497cb9651bd52076694f78ae26ef0 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Thu, 14 Nov 2013 23:09:42 -0500 Subject: [PATCH 03/41] Added filtering of the top X largest keys --- rdbtools/cli/rdb.py | 6 ++++-- 
rdbtools/memprofiler.py | 29 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 6995ccc..60093a2 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -24,6 +24,8 @@ def main(): If not specified, all data types will be returned""") parser.add_option("-s", "--size", dest="size", default=None, help="Limit memory output to keys greater to or equal to this value (in bytes)") + parser.add_option("-l", "--largest", dest="largest", default=None, + help="Limit memory output to only the top N keys (by size)") (options, args) = parser.parse_args() @@ -59,7 +61,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(f) elif 'memory' == options.command: - reporter = PrintAllKeys(f, options.size) + reporter = PrintAllKeys(f, options.size, options.largest) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(f) @@ -73,7 +75,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(sys.stdout) elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout, options.size) + reporter = PrintAllKeys(sys.stdout, options.size, options.largest) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(sys.stdout) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 2207517..9e92501 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -5,6 +5,8 @@ from rdbtools.parser import RdbCallback from rdbtools.callbacks import encode_key +from heapq import heappush, nlargest, heappop + ZSKIPLIST_MAXLEVEL=32 ZSKIPLIST_P=0.25 REDIS_SHARED_INTEGERS = 10000 @@ -68,16 +70,32 @@ def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) class PrintAllKeys(): - def __init__(self, out, size): + def __init__(self, out, size, largest): self._size = size + self._largest = largest 
self._out = out self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element")) + + if self._largest is not None: + self._heap = [] def next_record(self, record) : - if self._size is None or record.size >= int(self._size): - self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), - record.bytes, record.encoding, record.size, record.len_largest_element)) + if self._largest is None: + if self._size is None or record.bytes >= int(self._size): + self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), + record.bytes, record.encoding, record.size, record.len_largest_element)) + else: + heappush(self._heap, (record.bytes, record)) + self._heap = nlargest(int(self._largest), self._heap) + + def dump_heap(self): + if self._largest is not None: + self._largest = None + + while self._heap: + bytes, record = heappop(self._heap) + self.next_record(record) class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM @@ -106,8 +124,9 @@ def end_database(self, db_number): pass def end_rdb(self): + self._stream.dump_heap() pass - + def set(self, key, value, expiry, info): self._current_encoding = info['encoding'] size = self.sizeof_string(key) + self.sizeof_string(value) + self.top_level_object_overhead() From 69d2d771fe08816980ee10e2ab34649dc3185332 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Thu, 14 Nov 2013 23:14:39 -0500 Subject: [PATCH 04/41] Fixed a bug, renamed the argument Was using record.size instead of record.bytes. 
Also renamed the arguments to -b / --bytes to leave -s / --size open for perhaps another filter on the size of the key (number of elements) --- rdbtools/cli/rdb.py | 6 +++--- rdbtools/memprofiler.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 6995ccc..476702a 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -22,7 +22,7 @@ def main(): parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. If not specified, all data types will be returned""") - parser.add_option("-s", "--size", dest="size", default=None, + parser.add_option("-b", "--bytes", dest="bytes", default=None, help="Limit memory output to keys greater to or equal to this value (in bytes)") (options, args) = parser.parse_args() @@ -59,7 +59,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(f) elif 'memory' == options.command: - reporter = PrintAllKeys(f, options.size) + reporter = PrintAllKeys(f, options.bytes) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(f) @@ -73,7 +73,7 @@ def main(): elif 'json' == options.command: callback = JSONCallback(sys.stdout) elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout, options.size) + reporter = PrintAllKeys(sys.stdout, options.bytes) callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(sys.stdout) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 2207517..277320b 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -68,14 +68,14 @@ def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) class PrintAllKeys(): - def __init__(self, out, size): - self._size = size + def __init__(self, out, bytes): + 
self._bytes = bytes self._out = out self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element")) def next_record(self, record) : - if self._size is None or record.size >= int(self._size): + if self._bytes is None or record.bytes >= int(self._bytes): self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), record.bytes, record.encoding, record.size, record.len_largest_element)) From 71241b31a6f2b7f62379847611c392696d47a189 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Thu, 14 Nov 2013 23:58:28 -0500 Subject: [PATCH 05/41] Cleaned up if-then blocks Consolidated down to a single block of code, converted to a dictionary with lambda functions. --- rdbtools/cli/rdb.py | 46 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index ce6df05..60121e0 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -52,39 +52,25 @@ def main(): raise Exception('Invalid type provided - %s. 
Expected one of %s' % (x, (", ".join(VALID_TYPES)))) else: filters['types'].append(x) - - # TODO : Fix this ugly if-else code + if options.output: - with open(options.output, "wb") as f: - if 'diff' == options.command: - callback = DiffCallback(f) - elif 'json' == options.command: - callback = JSONCallback(f) - elif 'memory' == options.command: - reporter = PrintAllKeys(f, options.bytes, options.largest) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(f) - else: - raise Exception('Invalid Command %s' % options.command) - parser = RdbParser(callback) - parser.parse(dump_file) + f = open(options.output, "wb") else: - if 'diff' == options.command: - callback = DiffCallback(sys.stdout) - elif 'json' == options.command: - callback = JSONCallback(sys.stdout) - elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout, options.bytes, options.largest) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(sys.stdout) - else: - raise Exception('Invalid Command %s' % options.command) + f = sys.stdout + + try: + callback = { + 'diff': lambda f: DiffCallback(f), + 'json': lambda f: JSONCallback(f), + 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), 64), + 'protocol': lambda f: ProtocolCallback(f) + }[options.command](f) + except: + raise Exception('Invalid Command %s' % options.command) + + parser = RdbParser(callback, filters=filters) + parser.parse(dump_file) - parser = RdbParser(callback, filters=filters) - parser.parse(dump_file) - if __name__ == '__main__': main() From ac8e4c7d9c3971a9aabb2e514e10949a5345a544 Mon Sep 17 00:00:00 2001 From: Josh Sherman Date: Fri, 15 Nov 2013 09:45:54 -0500 Subject: [PATCH 06/41] Fixed nlargest on every iteration Realized the fallacy of this after a good night's rest. Now running nlargest at the end after loading everything into a heapq. 
Not ideal as it could cause memory issues, but I was able to successfully parse a 2.5m record rdb and grab the top 10 largest keys without any issues. --- rdbtools/memprofiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 06cd055..d98f7d6 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -87,10 +87,10 @@ def next_record(self, record) : record.bytes, record.encoding, record.size, record.len_largest_element)) else: heappush(self._heap, (record.bytes, record)) - self._heap = nlargest(int(self._largest), self._heap) def dump_heap(self): if self._largest is not None: + self._heap = nlargest(int(self._largest), self._heap) self._largest = None while self._heap: From 125c3edd39fa666e553b3e3ee7364996e90aab9a Mon Sep 17 00:00:00 2001 From: yoav Date: Tue, 4 Feb 2014 18:33:34 +0200 Subject: [PATCH 07/41] Add support for reading rdb from stream and handling extra checksum bytes at the end. --- rdbtools/parser.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 1bd49d8..0919df6 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -263,13 +263,21 @@ def __init__(self, callback, filters = None) : self._key = None self._expiry = None self.init_filter(filters) + self._rdb_version = 0 def parse(self, filename): """ Parse a redis rdb dump file, and call methods in the callback object during the parsing operation. """ - with open(filename, "rb") as f: + self.parse_fd(open(filename, "rb")) + + def parse_fd(self, fd): + """ + Parse a redis rdb dump from file object, and call methods in the + callback object during the parsing operation. 
+ """ + with fd as f: self.verify_magic_string(f.read(5)) self.verify_version(f.read(4)) self._callback.start_rdb() @@ -298,6 +306,8 @@ def parse(self, filename): if data_type == REDIS_RDB_OPCODE_EOF : self._callback.end_database(db_number) self._callback.end_rdb() + if self._rdb_version >= 5: + f.read(8) break if self.matches_filter(db_number) : @@ -610,6 +620,7 @@ def verify_version(self, version_str) : version = int(version_str) if version < 1 or version > 6 : raise Exception('verify_version', 'Invalid RDB version number %d' % version) + self._rdb_version = version def init_filter(self, filters): self._filters = {} From 05947e627d9ba3d93f0d5839a114a2adce951e2c Mon Sep 17 00:00:00 2001 From: yoav Date: Tue, 4 Feb 2014 18:36:24 +0200 Subject: [PATCH 08/41] Fixed copy paste typo --- rdbtools/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 0919df6..5586dd5 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -468,7 +468,7 @@ def skip_object(self, f, enc_type): elif enc_type == REDIS_RDB_TYPE_HASH_ZIPLIST : skip_strings = 1 else : - raise Exception('read_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) + raise Exception('skip_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) for x in xrange(0, skip_strings): self.skip_string(f) From 094efbc0100db53891f715a5f989316192f161d4 Mon Sep 17 00:00:00 2001 From: yoav Date: Sun, 30 Mar 2014 19:09:49 +0300 Subject: [PATCH 09/41] invalid indent --- rdbtools/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 5586dd5..7214b31 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -306,8 +306,8 @@ def parse_fd(self, fd): if data_type == REDIS_RDB_OPCODE_EOF : self._callback.end_database(db_number) self._callback.end_rdb() - if self._rdb_version >= 5: - f.read(8) + if self._rdb_version >= 5: + f.read(8) break if 
self.matches_filter(db_number) : From f2352316c1283dd71eb6479ae722d7b1dc91bd55 Mon Sep 17 00:00:00 2001 From: yoav Date: Sun, 13 Apr 2014 18:33:42 +0300 Subject: [PATCH 10/41] Remove index printing from sorted set diff callback (was useless and didn't produce usable output). --- rdbtools/callbacks.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index a361d02..3943184 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -247,12 +247,11 @@ def end_list(self, key): pass def start_sorted_set(self, key, length, expiry, info): - self._index = 0 - + pass + def zadd(self, key, score, member): - self._out.write('db=%d %s[%d] -> {%s, score=%s}' % (self._dbnum, encode_key(key), self._index, encode_key(member), encode_value(score))) + self._out.write('db=%d %s -> {%s, score=%s}' % (self._dbnum, encode_key(key), encode_key(member), encode_value(score))) self.newline() - self._index = self._index + 1 def end_sorted_set(self, key): pass From 5778ad9851df5af39b76b3ae6cb0c33f7ae6633a Mon Sep 17 00:00:00 2001 From: yoav Date: Mon, 18 Aug 2014 16:32:53 +0300 Subject: [PATCH 11/41] Correctly parse nan/inf values in (skiplist) zset scores --- rdbtools/parser.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 7214b31..b5462af 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -360,6 +360,19 @@ def read_string(self, f) : val = f.read(length) return val + def read_float(self, f): + dbl_length = read_unsigned_char(f) + if dbl_length == 253: + return float('nan') + elif dbl_length == 254: + return float('inf') + elif dbl_length == 255: + return float('-inf') + data = f.read(dbl_length) + if isinstance(data, str): + return float(data) + return data # bug?
+ # Read an object for the stream # f is the redis file # enc_type is the type of object @@ -393,10 +406,7 @@ def read_object(self, f, enc_type) : self._callback.start_sorted_set(self._key, length, self._expiry, info={'encoding':'skiplist'}) for count in xrange(0, length) : val = self.read_string(f) - dbl_length = read_unsigned_char(f) - score = f.read(dbl_length) - if isinstance(score, str): - score = float(score) + score = self.read_float(f) self._callback.zadd(self._key, score, val) self._callback.end_sorted_set(self._key) elif enc_type == REDIS_RDB_TYPE_HASH : From 7c68438b754d63cdac440558acc10a8dc4d71530 Mon Sep 17 00:00:00 2001 From: Adi Date: Mon, 15 Sep 2014 14:05:50 +0300 Subject: [PATCH 12/41] added parse_stream method that parses file like objects --- rdbtools/parser.py | 79 +++++++++++++++++++++++-------------------- tests/parser_tests.py | 13 +++++++ 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 1bd49d8..1f2457a 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -270,44 +270,51 @@ def parse(self, filename): callback object during the parsing operation. """ with open(filename, "rb") as f: - self.verify_magic_string(f.read(5)) - self.verify_version(f.read(4)) - self._callback.start_rdb() + self.parse_stream(f) + + def parse_stream(self, stream): + """ + Parse a redis rdb dump stream, and call methods in the + callback object during the parsing operation. 
+ """ + self.verify_magic_string(stream.read(5)) + self.verify_version(stream.read(4)) + self._callback.start_rdb() + + is_first_database = True + db_number = 0 + while True : + self._expiry = None + data_type = read_unsigned_char(stream) - is_first_database = True - db_number = 0 - while True : - self._expiry = None - data_type = read_unsigned_char(f) - - if data_type == REDIS_RDB_OPCODE_EXPIRETIME_MS : - self._expiry = to_datetime(read_unsigned_long(f) * 1000) - data_type = read_unsigned_char(f) - elif data_type == REDIS_RDB_OPCODE_EXPIRETIME : - self._expiry = to_datetime(read_unsigned_int(f) * 1000000) - data_type = read_unsigned_char(f) - - if data_type == REDIS_RDB_OPCODE_SELECTDB : - if not is_first_database : - self._callback.end_database(db_number) - is_first_database = False - db_number = self.read_length(f) - self._callback.start_database(db_number) - continue - - if data_type == REDIS_RDB_OPCODE_EOF : + if data_type == REDIS_RDB_OPCODE_EXPIRETIME_MS : + self._expiry = to_datetime(read_unsigned_long(stream) * 1000) + data_type = read_unsigned_char(stream) + elif data_type == REDIS_RDB_OPCODE_EXPIRETIME : + self._expiry = to_datetime(read_unsigned_int(stream) * 1000000) + data_type = read_unsigned_char(stream) + + if data_type == REDIS_RDB_OPCODE_SELECTDB : + if not is_first_database : self._callback.end_database(db_number) - self._callback.end_rdb() - break - - if self.matches_filter(db_number) : - self._key = self.read_string(f) - if self.matches_filter(db_number, self._key, data_type): - self.read_object(f, data_type) - else: - self.skip_object(f, data_type) - else : - self.skip_key_and_object(f, data_type) + is_first_database = False + db_number = self.read_length(stream) + self._callback.start_database(db_number) + continue + + if data_type == REDIS_RDB_OPCODE_EOF : + self._callback.end_database(db_number) + self._callback.end_rdb() + break + + if self.matches_filter(db_number) : + self._key = self.read_string(stream) + if 
self.matches_filter(db_number, self._key, data_type): + self.read_object(stream, data_type) + else: + self.skip_object(stream, data_type) + else : + self.skip_key_and_object(stream, data_type) def read_length_with_encoding(self, f) : length = 0 diff --git a/tests/parser_tests.py b/tests/parser_tests.py index cc09e78..53dbd23 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -189,6 +189,13 @@ def test_rdb_version_5_with_checksum(self): self.assertEquals(r.databases[0]['abcdef'], 'abcdef') self.assertEquals(r.databases[0]['longerstring'], 'thisisalongerstring.idontknowwhatitmeans') + def test_multiple_databases_stream(self): + r = load_rdb_stream('multiple_databases.rdb') + self.assert_(len(r.databases), 2) + self.assert_(1 not in r.databases) + self.assertEquals(r.databases[0]["key_in_zeroth_database"], "zero") + self.assertEquals(r.databases[2]["key_in_second_database"], "second") + def floateq(f1, f2) : return math.fabs(f1 - f2) < 0.00001 @@ -197,6 +204,12 @@ def load_rdb(file_name, filters=None) : parser = RdbParser(r, filters) parser.parse(os.path.join(os.path.dirname(__file__), 'dumps', file_name)) return r + +def load_rdb_stream(file_name, filters=None) : + r = MockRedis() + parser = RdbParser(r, filters) + parser.parse_stream(open(os.path.join(os.path.dirname(__file__), 'dumps', file_name), 'rb')) + return r class MockRedis(RdbCallback): def __init__(self) : From 822aeabd5c9acc47d4451351a0cc7a3fea26ca44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20VOLLE?= Date: Thu, 9 Jul 2015 17:35:02 +0200 Subject: [PATCH 13/41] Update all base classes to new style class New style classes allow for using the super() operator in inheriting classes and are generally considered better. 
--- rdbtools/cli/redis_memory_for_key.py | 2 +- rdbtools/memprofiler.py | 4 ++-- rdbtools/parser.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rdbtools/cli/redis_memory_for_key.py b/rdbtools/cli/redis_memory_for_key.py index 66ff244..8711e58 100755 --- a/rdbtools/cli/redis_memory_for_key.py +++ b/rdbtools/cli/redis_memory_for_key.py @@ -83,7 +83,7 @@ def check_redis_version(redis): def read_unsigned_char(f) : return struct.unpack('B', f.read(1))[0] -class PrintMemoryUsage(): +class PrintMemoryUsage(object): def next_record(self, record) : print("%s\t\t\t\t%s" % ("Key", encode_key(record.key))) print("%s\t\t\t\t%s" % ("Bytes", record.bytes)) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 3a80ba2..3ade7e3 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -11,7 +11,7 @@ MemoryRecord = namedtuple('MemoryRecord', ['database', 'type', 'key', 'bytes', 'encoding','size', 'len_largest_element']) -class StatsAggregator(): +class StatsAggregator(object): def __init__(self, key_groupings = None): self.aggregates = {} self.scatters = {} @@ -67,7 +67,7 @@ def add_scatter(self, heading, x, y): def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) -class PrintAllKeys(): +class PrintAllKeys(object): def __init__(self, out): self._out = out self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 1bd49d8..9449fb2 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -39,7 +39,7 @@ 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", 9 : "hash", 10 : "list", 11 : "set", 12 : "sortedset", 13 : "hash"} -class RdbCallback: +class RdbCallback(object): """ A Callback to handle events as the Redis dump file is parsed. This callback provides a serial and fast access to the dump file. 
@@ -236,7 +236,7 @@ def end_rdb(self): """Called to indicate we have completed parsing of the dump file""" pass -class RdbParser : +class RdbParser(object): """ A Parser for Redis RDB Files From b2640bdc4db24365e8a89ec32241460e8fc5e220 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Tue, 19 Jan 2016 11:26:06 -0500 Subject: [PATCH 14/41] use ujson/cStringIO/python-lzf if they're available --- .gitignore | 1 + rdbtools/cli/redis_memory_for_key.py | 7 ++- rdbtools/memprofiler.py | 5 +- rdbtools/parser.py | 78 ++++++++++++++++------------ 4 files changed, 55 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index cf3686a..0a233ef 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ tests/dumps/dump_dealers_vins.rdb tests/dumps/dump_random_lists.rdb tests/dumps/dump_sorted_sets.rdb +.idea/* \ No newline at end of file diff --git a/rdbtools/cli/redis_memory_for_key.py b/rdbtools/cli/redis_memory_for_key.py index 66ff244..3e51725 100755 --- a/rdbtools/cli/redis_memory_for_key.py +++ b/rdbtools/cli/redis_memory_for_key.py @@ -3,8 +3,11 @@ import os import sys -try : - from StringIO import StringIO +try: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO except ImportError: from io import StringIO diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 3a80ba2..8592477 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -1,6 +1,9 @@ from collections import namedtuple import random -import json +try: + import ujson as json +except: + import json from rdbtools.parser import RdbCallback from rdbtools.callbacks import encode_key diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 1bd49d8..37b5909 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -4,10 +4,19 @@ import datetime import re -try : - from StringIO import StringIO +try: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO except ImportError: from io import StringIO + 
+try: + import lzf + HAS_PYTHON_LZF = True +except ImportError: + HAS_PYTHON_LZF = False REDIS_RDB_6BITLEN = 0 REDIS_RDB_14BITLEN = 1 @@ -653,38 +662,41 @@ def get_logical_type(self, data_type): return DATA_TYPE_MAPPING[data_type] def lzf_decompress(self, compressed, expected_length): - in_stream = bytearray(compressed) - in_len = len(in_stream) - in_index = 0 - out_stream = bytearray() - out_index = 0 - - while in_index < in_len : - ctrl = in_stream[in_index] - if not isinstance(ctrl, int) : - raise Exception('lzf_decompress', 'ctrl should be a number %s for key %s' % (str(ctrl), self._key)) - in_index = in_index + 1 - if ctrl < 32 : - for x in xrange(0, ctrl + 1) : - out_stream.append(in_stream[in_index]) - #sys.stdout.write(chr(in_stream[in_index])) - in_index = in_index + 1 - out_index = out_index + 1 - else : - length = ctrl >> 5 - if length == 7 : - length = length + in_stream[in_index] - in_index = in_index + 1 - - ref = out_index - ((ctrl & 0x1f) << 8) - in_stream[in_index] - 1 + if HAS_PYTHON_LZF: + return lzf.decompress(compressed, expected_length) + else: + in_stream = bytearray(compressed) + in_len = len(in_stream) + in_index = 0 + out_stream = bytearray() + out_index = 0 + + while in_index < in_len : + ctrl = in_stream[in_index] + if not isinstance(ctrl, int) : + raise Exception('lzf_decompress', 'ctrl should be a number %s for key %s' % (str(ctrl), self._key)) in_index = in_index + 1 - for x in xrange(0, length + 2) : - out_stream.append(out_stream[ref]) - ref = ref + 1 - out_index = out_index + 1 - if len(out_stream) != expected_length : - raise Exception('lzf_decompress', 'Expected lengths do not match %d != %d for key %s' % (len(out_stream), expected_length, self._key)) - return str(out_stream) + if ctrl < 32 : + for x in xrange(0, ctrl + 1) : + out_stream.append(in_stream[in_index]) + #sys.stdout.write(chr(in_stream[in_index])) + in_index = in_index + 1 + out_index = out_index + 1 + else : + length = ctrl >> 5 + if length == 7 : + length = length 
+ in_stream[in_index] + in_index = in_index + 1 + + ref = out_index - ((ctrl & 0x1f) << 8) - in_stream[in_index] - 1 + in_index = in_index + 1 + for x in xrange(0, length + 2) : + out_stream.append(out_stream[ref]) + ref = ref + 1 + out_index = out_index + 1 + if len(out_stream) != expected_length : + raise Exception('lzf_decompress', 'Expected lengths do not match %d != %d for key %s' % (len(out_stream), expected_length, self._key)) + return str(out_stream) def skip(f, free): if free : From 58364a5e0d06d1590b8973f4aad8e106e580526e Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Thu, 11 Aug 2016 13:29:14 -0700 Subject: [PATCH 15/41] add more "filters" (not keys) and more Print for just Keys and Key/Values --- rdbtools/__init__.py | 4 ++-- rdbtools/cli/rdb.py | 16 +++++++++++++--- rdbtools/memprofiler.py | 16 ++++++++++++++++ rdbtools/parser.py | 7 +++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/rdbtools/__init__.py b/rdbtools/__init__.py index e3fe8b2..d0932b6 100644 --- a/rdbtools/__init__.py +++ b/rdbtools/__init__.py @@ -1,10 +1,10 @@ from rdbtools.parser import RdbCallback, RdbParser, DebugCallback from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback -from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator +from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator, PrintJustKeys, PrintJustKeyVals __version__ = '0.1.6' VERSION = tuple(map(int, __version__.split('.'))) __all__ = [ - 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'PrintAllKeys'] + 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'PrintAllKeys', 'PrintJustKeyVals'] diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 348102f..44892da 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -2,7 +2,7 @@ import os import sys from optparse import OptionParser -from rdbtools import RdbParser, 
JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys +from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, PrintJustKeys VALID_TYPES = ("hash", "set", "string", "list", "sortedset") def main(): @@ -12,13 +12,15 @@ def main(): parser = OptionParser(usage=usage) parser.add_option("-c", "--command", dest="command", - help="Command to execute. Valid commands are json, diff, and protocol", metavar="FILE") + help="Command to execute. Valid commands are json, diff, justkeys and protocol", metavar="FILE") parser.add_option("-f", "--file", dest="output", help="Output file", metavar="FILE") parser.add_option("-n", "--db", dest="dbs", action="append", help="Database Number. Multiple databases can be provided. If not specified, all databases will be included.") parser.add_option("-k", "--key", dest="keys", default=None, help="Keys to export. This can be a regular expression") + parser.add_option("-n", "--not-key", dest="keys", default=None, + help="Keys Not to export. This can be a regular expression") parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. 
If not specified, all data types will be returned""") @@ -40,6 +42,9 @@ def main(): if options.keys: filters['keys'] = options.keys + + if options.not_keys: + filters['not_keys'] = options.not_keys if options.types: filters['types'] = [] @@ -59,6 +64,9 @@ def main(): elif 'memory' == options.command: reporter = PrintAllKeys(f) callback = MemoryCallback(reporter, 64) + elif 'justkeys' == options.command: + reporter = PrintJustKeys(f) + callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(f) else: @@ -73,6 +81,9 @@ def main(): elif 'memory' == options.command: reporter = PrintAllKeys(sys.stdout) callback = MemoryCallback(reporter, 64) + elif 'justkeys' == options.command: + reporter = PrintJustKeys(sys.stdout) + callback = MemoryCallback(reporter, 64) elif 'protocol' == options.command: callback = ProtocolCallback(sys.stdout) else: @@ -83,4 +94,3 @@ def main(): if __name__ == '__main__': main() - diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 3a80ba2..147339e 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -76,7 +76,23 @@ def __init__(self, out): def next_record(self, record) : self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), record.bytes, record.encoding, record.size, record.len_largest_element)) + +class PrintJustKeys(): + def __init__(self, out): + self._out = out + + def next_record(self, record) : + self._out.write("%s\n" % encode_key(record.key)) + + +class PrintJustKeyVals(): + def __init__(self, out): + self._out = out + def next_record(self, record) : + self._out.write("%s %s\n" % encode_key(record.key)) + + class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM The memory usage is approximate, and based on heuristics. 
diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 1bd49d8..cabf95d 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -629,6 +629,11 @@ def init_filter(self, filters): self._filters['keys'] = re.compile(".*") else: self._filters['keys'] = re.compile(filters['keys']) + + if not ('not_keys' in filters and filters['not_keys']): + self._filters['not_keys'] = None + else: + self._filters['not_keys'] = re.compile(filters['not_keys']) if not 'types' in filters: self._filters['types'] = ('set', 'hash', 'sortedset', 'string', 'list') @@ -642,6 +647,8 @@ def init_filter(self, filters): def matches_filter(self, db_number, key=None, data_type=None): if self._filters['dbs'] and (not db_number in self._filters['dbs']): return False + if key and self._filters['not_keys'] and (self._filters['not_keys'].match(str(key))): + return False if key and (not self._filters['keys'].match(str(key))): return False From 968411d08d2362ab0ee3735812ae8ff47c9b171e Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Thu, 11 Aug 2016 13:31:16 -0700 Subject: [PATCH 16/41] flags goober --- rdbtools/cli/rdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 44892da..ed38707 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -19,7 +19,7 @@ def main(): help="Database Number. Multiple databases can be provided. If not specified, all databases will be included.") parser.add_option("-k", "--key", dest="keys", default=None, help="Keys to export. This can be a regular expression") - parser.add_option("-n", "--not-key", dest="keys", default=None, + parser.add_option("-o", "--not-key", dest="not_keys", default=None, help="Keys Not to export. This can be a regular expression") parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. 
From 0a127993e1e092b976a03d9ce8ed10bb8777e502 Mon Sep 17 00:00:00 2001 From: oranagra Date: Wed, 17 Aug 2016 22:12:50 +0300 Subject: [PATCH 17/41] fix test suite broken by b2e2d5e8 (Added filtering of the top X largest keys) --- rdbtools/memprofiler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index fba68c2..93080f0 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -92,7 +92,7 @@ def next_record(self, record) : else: heappush(self._heap, (record.bytes, record)) - def dump_heap(self): + def end_rdb(self): if self._largest is not None: self._heap = nlargest(int(self._largest), self._heap) self._largest = None @@ -150,10 +150,13 @@ def end_database(self, db_number): self._stream.next_record(record) record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_expires), None, None, None) self._stream.next_record(record) + if hasattr(self._stream, 'end_database'): + self._stream.end_database(db_number) def end_rdb(self): #print('internal fragmentation: %s' % self._total_internal_frag) - self._stream.dump_heap() + if hasattr(self._stream, 'end_rdb'): + self._stream.end_rdb() def set(self, key, value, expiry, info): self._current_encoding = info['encoding'] From 98902c5d6b0d4a55742d380401899cac8d9d488e Mon Sep 17 00:00:00 2001 From: oranagra Date: Thu, 18 Aug 2016 18:13:17 +0300 Subject: [PATCH 18/41] setup.py metadata updates --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3efc6ab..74b376d 100644 --- a/setup.py +++ b/setup.py @@ -23,11 +23,11 @@ 'description' : 'Utilities to convert Redis RDB files to JSON or SQL formats', 'long_description' : long_description, 'url': 'https://github.com/sripathikrishnan/redis-rdb-tools', - 'download_url' : 'http://cloud.github.com/downloads/andymccurdy/redis-py/redis-%s.tar.gz' % __version__, - 'author' : 'Sripathi Krishnan', + 'download_url': 
'https://github.com/sripathikrishnan/redis-rdb-tools/archive/rdbtools-%s.tar.gz' % __version__, + 'author': 'Sripathi Krishnan, Redis Labs', 'author_email' : 'Sripathi.Krishnan@gmail.com', - 'maintainer' : 'Sripathi Krishnan', - 'maintainer_email' : 'Sripathi.Krishnan@gmail.com', + 'maintainer': 'Sripathi Krishnan, Redis Labs', + 'maintainer_email': 'oss@redislabs.com', 'keywords' : ['Redis', 'RDB', 'Export', 'Dump', 'Memory Profiler'], 'license' : 'MIT', 'packages' : ['rdbtools', 'rdbtools.cli'], From 89f030f9e0da422ee0bed51e0dbcec2a8302b506 Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Wed, 24 Aug 2016 17:17:42 -0700 Subject: [PATCH 19/41] upstream merge mistakes --- rdbtools/cli/rdb.py | 60 ++++----- rdbtools/memprofiler.py | 286 ++++++++++++++++++++++++++++++++-------- 2 files changed, 253 insertions(+), 93 deletions(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index ed38707..2dc91f0 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -12,7 +12,7 @@ def main(): parser = OptionParser(usage=usage) parser.add_option("-c", "--command", dest="command", - help="Command to execute. Valid commands are json, diff, justkeys and protocol", metavar="FILE") + help="Command to execute. Valid commands are json, diff, justkeys, justkeyvals and protocol", metavar="FILE") parser.add_option("-f", "--file", dest="output", help="Output file", metavar="FILE") parser.add_option("-n", "--db", dest="dbs", action="append", @@ -24,6 +24,10 @@ def main(): parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. 
If not specified, all data types will be returned""") + parser.add_option("-b", "--bytes", dest="bytes", default=None, + help="Limit memory output to keys greater to or equal to this value (in bytes)") + parser.add_option("-l", "--largest", dest="largest", default=None, + help="Limit memory output to only the top N keys (by size)") (options, args) = parser.parse_args() @@ -53,44 +57,26 @@ def main(): raise Exception('Invalid type provided - %s. Expected one of %s' % (x, (", ".join(VALID_TYPES)))) else: filters['types'].append(x) - - # TODO : Fix this ugly if-else code + if options.output: - with open(options.output, "wb") as f: - if 'diff' == options.command: - callback = DiffCallback(f) - elif 'json' == options.command: - callback = JSONCallback(f) - elif 'memory' == options.command: - reporter = PrintAllKeys(f) - callback = MemoryCallback(reporter, 64) - elif 'justkeys' == options.command: - reporter = PrintJustKeys(f) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(f) - else: - raise Exception('Invalid Command %s' % options.command) - parser = RdbParser(callback) - parser.parse(dump_file) + f = open(options.output, "wb") else: - if 'diff' == options.command: - callback = DiffCallback(sys.stdout) - elif 'json' == options.command: - callback = JSONCallback(sys.stdout) - elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout) - callback = MemoryCallback(reporter, 64) - elif 'justkeys' == options.command: - reporter = PrintJustKeys(sys.stdout) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(sys.stdout) - else: - raise Exception('Invalid Command %s' % options.command) - - parser = RdbParser(callback, filters=filters) - parser.parse(dump_file) + f = sys.stdout + try: + callback = { + 'diff': lambda f: DiffCallback(f), + 'json': lambda f: JSONCallback(f), + 'justkeys': lambda f: MemoryCallback(PrintJustKeys(f), 64), + 
'justkeysvals': lambda f: MemoryCallback(PrintJustKeyVals(f), 64), + 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), 64), + 'protocol': lambda f: ProtocolCallback(f) + }[options.command](f) + except: + raise Exception('Invalid Command %s' % options.command) + + parser = RdbParser(callback, filters=filters) + parser.parse(dump_file) + if __name__ == '__main__': main() diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 147339e..a6fc2b5 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -1,9 +1,16 @@ from collections import namedtuple import random -import json +import bisect +from distutils.version import StrictVersion +try: + import ujson as json +except: + import json from rdbtools.parser import RdbCallback -from rdbtools.callbacks import encode_key +from rdbtools.callbacks import encode_key, encode_value + +from heapq import heappush, nlargest, heappop ZSKIPLIST_MAXLEVEL=32 ZSKIPLIST_P=0.25 @@ -11,7 +18,7 @@ MemoryRecord = namedtuple('MemoryRecord', ['database', 'type', 'key', 'bytes', 'encoding','size', 'len_largest_element']) -class StatsAggregator(): +class StatsAggregator(object): def __init__(self, key_groupings = None): self.aggregates = {} self.scatters = {} @@ -38,6 +45,8 @@ def next_record(self, record): self.add_scatter('sortedset_memory_by_length', record.bytes, record.size) elif record.type == 'string': self.add_scatter('string_memory_by_length', record.bytes, record.size) + elif record.type == 'dict': + pass else: raise Exception('Invalid data type %s' % record.type) @@ -67,17 +76,35 @@ def add_scatter(self, heading, x, y): def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) -class PrintAllKeys(): - def __init__(self, out): +class PrintAllKeys(object): + def __init__(self, out, bytes, largest): + self._bytes = bytes + self._largest = largest self._out = out self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % 
("database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element")) + + if self._largest is not None: + self._heap = [] def next_record(self, record) : - self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), - record.bytes, record.encoding, record.size, record.len_largest_element)) + if self._largest is None: + if self._bytes is None or record.bytes >= int(self._bytes): + self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), + record.bytes, record.encoding, record.size, record.len_largest_element)) + else: + heappush(self._heap, (record.bytes, record)) + + def end_rdb(self): + if self._largest is not None: + self._heap = nlargest(int(self._largest), self._heap) + self._largest = None -class PrintJustKeys(): + while self._heap: + bytes, record = heappop(self._heap) + self.next_record(record) + +class PrintJustKeys(object): def __init__(self, out): self._out = out @@ -85,43 +112,71 @@ def next_record(self, record) : self._out.write("%s\n" % encode_key(record.key)) -class PrintJustKeyVals(): +class PrintJustKeyVals(object): def __init__(self, out): self._out = out def next_record(self, record) : - self._out.write("%s %s\n" % encode_key(record.key)) - - + self._out.write("%s %s\n" % (encode_key(record.key), encode_value(record.value)) + + class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM The memory usage is approximate, and based on heuristics. 
''' - def __init__(self, stream, architecture): + def __init__(self, stream, architecture, redis_version='3.2'): self._stream = stream self._dbnum = 0 self._current_size = 0 self._current_encoding = None self._current_length = 0 self._len_largest_element = 0 - + self._db_keys = 0 + self._db_expires = 0 + self._aux_used_mem = None + self._aux_redis_ver = None + self._aux_redis_bits = None + self._redis_version = StrictVersion(redis_version) + self._total_internal_frag = 0 if architecture == 64 or architecture == '64': self._pointer_size = 8 + self._long_size = 8 + self._architecture = 64 elif architecture == 32 or architecture == '32': self._pointer_size = 4 + self._long_size = 4 + self._architecture = 32 def start_rdb(self): pass + def aux_field(self, key, value): + #print('aux: %s %s' % (key, value)) + if key == 'used-mem': + self._aux_used_mem = int(value) + if key == 'redis-ver': + self._aux_redis_ver = value + if key == 'redis-bits': + self._aux_redis_bits = int(value) + def start_database(self, db_number): self._dbnum = db_number + self._db_keys = 0 + self._db_expires = 0 def end_database(self, db_number): - pass - + record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_keys), None, None, None) + self._stream.next_record(record) + record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_expires), None, None, None) + self._stream.next_record(record) + if hasattr(self._stream, 'end_database'): + self._stream.end_database(db_number) + def end_rdb(self): - pass - + #print('internal fragmentation: %s' % self._total_internal_frag) + if hasattr(self._stream, 'end_rdb'): + self._stream.end_rdb() + def set(self, key, value, expiry, info): self._current_encoding = info['encoding'] size = self.sizeof_string(key) + self.sizeof_string(value) + self.top_level_object_overhead() @@ -184,32 +239,65 @@ def end_set(self, key): self._stream.next_record(record) self.end_key() - def start_list(self, key, length, expiry, info): - 
self._current_length = length + def start_list(self, key, expiry, info): + self._current_length = 0 + self._list_items_size = 0 + self._list_items_zipped_size = 0 self._current_encoding = info['encoding'] size = self.sizeof_string(key) size += 2*self.robj_overhead() size += self.top_level_object_overhead() size += self.key_expiry_overhead(expiry) - - if 'sizeof_value' in info: - size += info['sizeof_value'] - elif 'encoding' in info and info['encoding'] == 'linkedlist': - size += self.linkedlist_overhead() + + # ignore the encoding in the rdb, and predict the encoding that will be used at the target redis version + if self._redis_version >= StrictVersion('3.2'): + # default configuration of redis 3.2 + self._current_encoding = "quicklist" + self._list_max_ziplist_size = 8192 # default is -2 which means 8k + self._list_compress_depth = 0 # currently we only support no compression which is the default + self._cur_zips = 1 + self._cur_zip_size = 0 else: - raise Exception('start_list', 'Could not find encoding or sizeof_value in info object %s' % info) + # default configuration fo redis 2.8 -> 3.0 + self._current_encoding = "ziplist" + self._list_max_ziplist_entries = 512 + self._list_max_ziplist_value = 64 + self._current_size = size - def rpush(self, key, value) : - if(element_length(value) > self._len_largest_element) : + def rpush(self, key, value): + self._current_length += 1 + size = self.sizeof_string(value) if type(value) != int else 4 + + if(element_length(value) > self._len_largest_element): self._len_largest_element = element_length(value) - - if self._current_encoding == 'linkedlist': - self._current_size += self.sizeof_string(value) - self._current_size += self.linkedlist_entry_overhead() - self._current_size += self.robj_overhead() - - def end_list(self, key): + + if self._current_encoding == "ziplist": + self._list_items_zipped_size += self.ziplist_entry_overhead(value) + if self._current_length > self._list_max_ziplist_entries or size > 
self._list_max_ziplist_value: + self._current_encoding = "linkedlist" + elif self._current_encoding == "quicklist": + if self._cur_zip_size + size > self._list_max_ziplist_size: + self._cur_zip_size = size + self._cur_zips += 1 + else: + self._cur_zip_size += size + self._list_items_zipped_size += self.ziplist_entry_overhead(value) + self._list_items_size += size # not to be used in case of ziplist or quicklist + + def end_list(self, key, info): + if self._current_encoding == 'quicklist': + self._current_size += self.quicklist_overhead(self._cur_zips) + self._current_size += self.ziplist_header_overhead() * self._cur_zips + self._current_size += self._list_items_zipped_size + elif self._current_encoding == 'ziplist': + self._current_size += self.ziplist_header_overhead() + self._current_size += self._list_items_zipped_size + else: # linkedlist + self._current_size += self.linkedlist_entry_overhead() * self._current_length + self._current_size += self.linkedlist_overhead() + self._current_size += self.robj_overhead() * self._current_length + self._current_size += self._list_items_size record = MemoryRecord(self._dbnum, "list", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) self._stream.next_record(record) self.end_key() @@ -246,18 +334,13 @@ def end_sorted_set(self, key): self.end_key() def end_key(self): + self._db_keys += 1 self._current_encoding = None self._current_size = 0 self._len_largest_element = 0 def sizeof_string(self, string): - # See struct sdshdr over here https://github.com/antirez/redis/blob/unstable/src/sds.h - # int len : 4 bytes - # int free : 4 bytes - # char buf[] : size will be the length of the string - # 1 extra byte is used to store the null character at the end of the string - # Redis internally stores integers as a long - # Integers less than REDIS_SHARED_INTEGERS are stored in a shared memory pool + # https://github.com/antirez/redis/blob/unstable/src/sds.h try: num = int(string) if num 
< REDIS_SHARED_INTEGERS : @@ -266,7 +349,18 @@ def sizeof_string(self, string): return 8 except ValueError: pass - return len(string) + 8 + 1 + self.malloc_overhead() + l = len(string) + if self._redis_version < StrictVersion('3.2'): + return self.malloc_overhead(l + 8 + 1) + if l < 2**5: + return self.malloc_overhead(l + 1 + 1) + if l < 2**8: + return self.malloc_overhead(l + 1 + 2 + 1) + if l < 2**16: + return self.malloc_overhead(l + 1 + 4 + 1) + if l < 2**32: + return self.malloc_overhead(l + 1 + 8 + 1) + return self.malloc_overhead(l + 1 + 16 + 1) def top_level_object_overhead(self): # Each top level object is an entry in a dictionary, and so we have to include @@ -277,6 +371,7 @@ def key_expiry_overhead(self, expiry): # If there is no expiry, there isn't any overhead if not expiry: return 0 + self._db_expires += 1 # Key expiry is stored in a hashtable, so we have to pay for the cost of a hashtable entry # The timestamp itself is stored as an int64, which is a 8 bytes return self.hashtable_entry_overhead() + 8 @@ -284,31 +379,69 @@ def key_expiry_overhead(self, expiry): def hashtable_overhead(self, size): # See https://github.com/antirez/redis/blob/unstable/src/dict.h # See the structures dict and dictht - # 2 * (3 unsigned longs + 1 pointer) + 2 ints + 2 pointers - # = 56 + 4 * sizeof_pointer() + # 2 * (3 unsigned longs + 1 pointer) + int + long + 2 pointers # # Additionally, see **table in dictht # The length of the table is the next power of 2 # When the hashtable is rehashing, another instance of **table is created - # We are assuming 0.5 percent probability of rehashing, and so multiply + # Due to the possibility of rehashing during loading, we calculate the worse + # case in which both tables are allocated, and so multiply # the size of **table by 1.5 - return 56 + 4*self.sizeof_pointer() + self.next_power(size)*self.sizeof_pointer()*1.5 + return 4 + 7*self.sizeof_long() + 4*self.sizeof_pointer() + self.next_power(size)*self.sizeof_pointer()*1.5 def 
hashtable_entry_overhead(self): # See https://github.com/antirez/redis/blob/unstable/src/dict.h - # Each dictEntry has 3 pointers - return 3*self.sizeof_pointer() + # Each dictEntry has 2 pointers + int64 + return 2*self.sizeof_pointer() + 8 def linkedlist_overhead(self): # See https://github.com/antirez/redis/blob/unstable/src/adlist.h # A list has 5 pointers + an unsigned long - return 8 + 5*self.sizeof_pointer() - + return self.sizeof_long() + 5*self.sizeof_pointer() + + def quicklist_overhead(self, zip_count): + quicklist = 2*self.sizeof_pointer()+self.sizeof_long()+2*4 + quickitem = 4*self.sizeof_pointer()+self.sizeof_long()+2*4 + return quicklist + zip_count*quickitem + def linkedlist_entry_overhead(self): # See https://github.com/antirez/redis/blob/unstable/src/adlist.h # A node has 3 pointers return 3*self.sizeof_pointer() - + + def ziplist_header_overhead(self): + # See https://github.com/antirez/redis/blob/unstable/src/ziplist.c + # + return 4 + 4 + 2 + 1 + + def ziplist_entry_overhead(self, value): + # See https://github.com/antirez/redis/blob/unstable/src/ziplist.c + if type(value) == int: + header = 1 + if value < 12: + size = 0 + elif value < 2**8: + size = 1 + elif value < 2**16: + size = 2 + elif value < 2**24: + size = 3 + elif value < 2**32: + size = 4 + else: + size = 8 + else: + size = len(value) + if size <= 63: + header = 1 + elif size <= 16383: + header = 2 + else: + header = 5 + # add len again for prev_len of the next record + prev_len = 1 if size < 254 else 5 + return prev_len + header + size + def skiplist_overhead(self, size): return 2*self.sizeof_pointer() + self.hashtable_overhead(size) + (2*self.sizeof_pointer() + 16) @@ -318,8 +451,10 @@ def skiplist_entry_overhead(self): def robj_overhead(self): return self.sizeof_pointer() + 8 - def malloc_overhead(self): - return self.size_t() + def malloc_overhead(self, size): + alloc = get_jemalloc_allocation(size) + self._total_internal_frag += alloc - size + return alloc def size_t(self): 
return self.sizeof_pointer() @@ -327,6 +462,9 @@ def size_t(self): def sizeof_pointer(self): return self._pointer_size + def sizeof_long(self): + return self._long_size + def next_power(self, size): power = 1 while (power <= size) : @@ -344,7 +482,6 @@ def zset_random_level(self): else: return ZSKIPLIST_MAXLEVEL - def element_length(element): if isinstance(element, int): return 8 @@ -352,4 +489,41 @@ def element_length(element): return 16 else: return len(element) - + + +# size classes from jemalloc 4.0.4 using LG_QUANTUM=3 +jemalloc_size_classes = [ + 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, 448, 512, 640, 768, 896, 1024, + 1280, 1536, 1792, 2048, 2560, 3072, 3584, 4096, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 20480, 24576, + 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688,131072, 163840, 196608, 229376, 262144, 327680, + 393216, 458752, 524288, 655360, 786432, 917504, 1048576, 1310720, 1572864, 1835008, 2097152, 2621440, 3145728, + 3670016, 4194304, 5242880, 6291456, 7340032, 8388608, 10485760, 12582912, 14680064, 16777216, 20971520, 25165824, + 29360128, 33554432, 41943040, 50331648, 58720256, 67108864, 83886080, 100663296, 117440512, 134217728, 167772160, + 201326592, 234881024, 268435456, 335544320, 402653184, 469762048, 536870912, 671088640, 805306368, 939524096, + 1073741824, 1342177280, 1610612736, 1879048192, 2147483648, 2684354560, 3221225472, 3758096384, 4294967296, + 5368709120, 6442450944, 7516192768, 8589934592, 10737418240, 12884901888, 15032385536, 17179869184, 21474836480, + 25769803776, 30064771072, 34359738368, 42949672960, 51539607552, 60129542144, 68719476736, 85899345920, + 103079215104, 120259084288, 137438953472, 171798691840, 206158430208, 240518168576, 274877906944, 343597383680, + 412316860416, 481036337152, 549755813888, 687194767360, 824633720832, 962072674304, 1099511627776,1374389534720, + 1649267441664, 1924145348608, 2199023255552, 2748779069440, 3298534883328, 
3848290697216, 4398046511104, + 5497558138880, 6597069766656, 7696581394432, 8796093022208, 10995116277760, 13194139533312, 15393162788864, + 17592186044416, 21990232555520, 26388279066624, 30786325577728, 35184372088832, 43980465111040, 52776558133248, + 61572651155456, 70368744177664, 87960930222080, 105553116266496, 123145302310912, 140737488355328, 175921860444160, + 211106232532992, 246290604621824, 281474976710656, 351843720888320, 422212465065984, 492581209243648, + 562949953421312, 703687441776640, 844424930131968, 985162418487296, 1125899906842624, 1407374883553280, + 1688849860263936, 1970324836974592, 2251799813685248, 2814749767106560, 3377699720527872, 3940649673949184, + 4503599627370496, 5629499534213120, 6755399441055744, 7881299347898368, 9007199254740992, 11258999068426240, + 13510798882111488, 15762598695796736, 18014398509481984, 22517998136852480, 27021597764222976,31525197391593472, + 36028797018963968, 45035996273704960, 54043195528445952, 63050394783186944, 72057594037927936, 90071992547409920, + 108086391056891904, 126100789566373888, 144115188075855872, 180143985094819840, 216172782113783808, + 252201579132747776, 288230376151711744, 360287970189639680, 432345564227567616, 504403158265495552, + 576460752303423488, 720575940379279360, 864691128455135232, 1008806316530991104, 1152921504606846976, + 1441151880758558720, 1729382256910270464, 2017612633061982208, 2305843009213693952, 2882303761517117440, + 3458764513820540928, 4035225266123964416, 4611686018427387904, 5764607523034234880, 6917529027641081856, + 8070450532247928832, 9223372036854775808, 11529215046068469760, 13835058055282163712, 16140901064495857664 +] # TODO: use different table depending oon the redis-version used + +def get_jemalloc_allocation(size): + idx = bisect.bisect_left(jemalloc_size_classes, size) + alloc = jemalloc_size_classes[idx] if idx < len(jemalloc_size_classes) else size + return alloc From ef76578a93619d8e592f2f383a30fd37c5c3f86f Mon Sep 17 00:00:00 2001 
From: Bo Blanton Date: Wed, 24 Aug 2016 17:19:52 -0700 Subject: [PATCH 20/41] parns --- rdbtools/memprofiler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index a6fc2b5..02321d1 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -108,7 +108,7 @@ class PrintJustKeys(object): def __init__(self, out): self._out = out - def next_record(self, record) : + def next_record(self, record): self._out.write("%s\n" % encode_key(record.key)) @@ -116,10 +116,10 @@ class PrintJustKeyVals(object): def __init__(self, out): self._out = out - def next_record(self, record) : - self._out.write("%s %s\n" % (encode_key(record.key), encode_value(record.value)) + def next_record(self, record): + self._out.write("%s %s\n" % (encode_key(record.key), encode_value(record.value))) - + class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM The memory usage is approximate, and based on heuristics. 
From f132a5e7a2872e944927effdf1c644c30abbbcad Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Wed, 24 Aug 2016 17:21:16 -0700 Subject: [PATCH 21/41] justkeyvals not justkeysvals --- rdbtools/cli/rdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 2dc91f0..b5d4822 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -68,7 +68,7 @@ def main(): 'diff': lambda f: DiffCallback(f), 'json': lambda f: JSONCallback(f), 'justkeys': lambda f: MemoryCallback(PrintJustKeys(f), 64), - 'justkeysvals': lambda f: MemoryCallback(PrintJustKeyVals(f), 64), + 'justkeyvals': lambda f: MemoryCallback(PrintJustKeyVals(f), 64), 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), 64), 'protocol': lambda f: ProtocolCallback(f) }[options.command](f) From c6d1b33ef582a670d2ffe729a7877cc597a20013 Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Wed, 24 Aug 2016 17:24:04 -0700 Subject: [PATCH 22/41] include --- rdbtools/cli/rdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index b5d4822..cec799e 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -2,7 +2,7 @@ import os import sys from optparse import OptionParser -from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, PrintJustKeys +from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, PrintJustKeys, PrintJustKeyVals VALID_TYPES = ("hash", "set", "string", "list", "sortedset") def main(): From a39d3782763575d8bc2ee7ab09460ef7e6f2ed0b Mon Sep 17 00:00:00 2001 From: Bo Blanton Date: Wed, 24 Aug 2016 17:43:51 -0700 Subject: [PATCH 23/41] =?UTF-8?q?move=20things=20to=20callbacks=20as=20tha?= =?UTF-8?q?t=E2=80=99s=20better?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdbtools/__init__.py | 6 +-- 
rdbtools/callbacks.py | 110 ++++++++++++++++++++++++++++++++++++++++ rdbtools/cli/rdb.py | 6 +-- rdbtools/memprofiler.py | 8 --- 4 files changed, 116 insertions(+), 14 deletions(-) diff --git a/rdbtools/__init__.py b/rdbtools/__init__.py index 8ca22cc..9d4e0b8 100644 --- a/rdbtools/__init__.py +++ b/rdbtools/__init__.py @@ -1,10 +1,10 @@ from rdbtools.parser import RdbCallback, RdbParser, DebugCallback -from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback -from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator, PrintJustKeys, PrintJustKeyVals +from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback, KeyValsOnlyCallback, KeysOnlyCallback +from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator, PrintJustKeys __version__ = '0.1.7' VERSION = tuple(map(int, __version__.split('.'))) __all__ = [ - 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'PrintAllKeys', 'PrintJustKeyVals'] + 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'KeyValsOnlyCallback', 'KeysOnlyCallback', 'PrintJustKeys'] diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index 8c067ab..3fc3f49 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -190,6 +190,116 @@ def zadd(self, key, score, member): def end_sorted_set(self, key): self._end_key(key) self._out.write('}') + + +class KeysOnlyCallback(RdbCallback): + def __init__(self, out): + self._out = out + + def _keyout(self, key): + self._out.write('%s\n' % (encode_key(key))) + + def set(self, key, value, expiry, info): + self._keyout(key) + + def start_hash(self, key, length, expiry, info): + self._keyout(key) + + def hset(self, key, field, value): + self._keyout(key) + + def start_set(self, key, cardinality, expiry, info): + self._keyout(key) + + def sadd(self, key, member): + self._keyout(key) + + def start_list(self, key, expiry, info): + 
self._keyout(key) + + def rpush(self, key, value) : + self._keyout(key) + + def start_sorted_set(self, key, length, expiry, info): + self._keyout(key) + + def zadd(self, key, score, member): + self._keyout(key) + + +class KeyValsOnlyCallback(RdbCallback): + def __init__(self, out): + self._out = out + self._is_first_db = True + self._has_databases = False + self._is_first_key_in_db = True + self._elements_in_key = 0 + self._element_index = 0 + + def _start_key(self, key, length): + if not self._is_first_key_in_db: + self._out.write(',') + self._out.write('\r\n') + self._is_first_key_in_db = False + self._elements_in_key = length + self._element_index = 0 + + def _end_key(self, key): + pass + + def _write_comma(self): + if self._element_index > 0 and self._element_index < self._elements_in_key : + self._out.write(',') + self._element_index = self._element_index + 1 + + def set(self, key, value, expiry, info): + self._start_key(key, 0) + self._out.write('%s %s' % (encode_key(key), encode_value(value))) + + def start_hash(self, key, length, expiry, info): + self._start_key(key, length) + self._out.write('%s ' % encode_key(key)) + + def hset(self, key, field, value): + self._write_comma() + self._out.write('%s %s' % (encode_key(field), encode_value(value))) + + def end_hash(self, key): + self._end_key(key) + + def start_set(self, key, cardinality, expiry, info): + self._start_key(key, cardinality) + self._out.write('%s ' % encode_key(key)) + + def sadd(self, key, member): + self._write_comma() + self._out.write('%s' % encode_value(member)) + + def end_set(self, key): + self._end_key(key) + + def start_list(self, key, expiry, info): + self._start_key(key, 0) + self._out.write('%s ' % encode_key(key)) + + def rpush(self, key, value) : + self._elements_in_key += 1 + self._write_comma() + self._out.write('%s' % encode_value(value)) + + def end_list(self, key, info): + self._end_key(key) + + def start_sorted_set(self, key, length, expiry, info): + self._start_key(key, 
length) + self._out.write('%s ' % encode_key(key)) + + def zadd(self, key, score, member): + self._write_comma() + self._out.write('%s %s' % (encode_key(member), encode_value(score))) + + def end_sorted_set(self, key): + self._end_key(key) class DiffCallback(RdbCallback): diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index cec799e..32d7b3e 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -2,7 +2,7 @@ import os import sys from optparse import OptionParser -from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, PrintJustKeys, PrintJustKeyVals +from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, KeysOnlyCallback, KeyValsOnlyCallback VALID_TYPES = ("hash", "set", "string", "list", "sortedset") def main(): @@ -67,8 +67,8 @@ def main(): callback = { 'diff': lambda f: DiffCallback(f), 'json': lambda f: JSONCallback(f), - 'justkeys': lambda f: MemoryCallback(PrintJustKeys(f), 64), - 'justkeyvals': lambda f: MemoryCallback(PrintJustKeyVals(f), 64), + 'justkeys': lambda f: KeysOnlyCallback(f), + 'justkeyvals': lambda f: KeyValsOnlyCallback(f), 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), 64), 'protocol': lambda f: ProtocolCallback(f) }[options.command](f) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 02321d1..540d697 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -110,15 +110,7 @@ def __init__(self, out): def next_record(self, record): self._out.write("%s\n" % encode_key(record.key)) - -class PrintJustKeyVals(object): - def __init__(self, out): - self._out = out - - def next_record(self, record): - self._out.write("%s %s\n" % (encode_key(record.key), encode_value(record.value))) - class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM From eca004b0dae79992d5dc6646542904d2efd477ba Mon Sep 17 00:00:00 2001 From: 
oranagra Date: Sun, 28 Aug 2016 07:44:11 +0300 Subject: [PATCH 24/41] fix issue #70, bug introduced in recent memory profiler improvements --- rdbtools/memprofiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 540d697..9f490a2 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -88,6 +88,8 @@ def __init__(self, out, bytes, largest): self._heap = [] def next_record(self, record) : + if record.key is None: + return # some records are not keys (e.g. dict) if self._largest is None: if self._bytes is None or record.bytes >= int(self._bytes): self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), From 954124bbbdcb8e3f0db6ebdafeb2133b06f50c65 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Thu, 11 Aug 2016 22:50:08 +0200 Subject: [PATCH 25/41] add missing object inheritance --- tests/memprofiler_tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/memprofiler_tests.py b/tests/memprofiler_tests.py index 5a083f1..6e6d248 100644 --- a/tests/memprofiler_tests.py +++ b/tests/memprofiler_tests.py @@ -4,7 +4,7 @@ from rdbtools import MemoryCallback import os -class Stats(): +class Stats(object): def __init__(self): self.records = {} @@ -25,5 +25,4 @@ def setUp(self): def test_len_largest_element(self): stats = get_stats('ziplist_that_compresses_easily.rdb') self.assertEqual(stats['ziplist_compresses_easily'].len_largest_element, 36, "Length of largest element does not match") - - \ No newline at end of file + From 3470263b725ce651894f1331bed8133f261a42fc Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Thu, 11 Aug 2016 22:52:52 +0200 Subject: [PATCH 26/41] make sure int division results in int --- rdbtools/parser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 267e973..40c7463 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -587,7 
+587,7 @@ def read_zset_from_ziplist(self, f) : num_entries = read_unsigned_short(buff) if (num_entries % 2) : raise Exception('read_zset_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) - num_entries = num_entries /2 + num_entries = num_entries // 2 self._callback.start_sorted_set(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) for x in xrange(0, num_entries) : member = self.read_ziplist_entry(buff) @@ -608,7 +608,7 @@ def read_hash_from_ziplist(self, f) : num_entries = read_unsigned_short(buff) if (num_entries % 2) : raise Exception('read_hash_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) - num_entries = num_entries /2 + num_entries = num_entries // 2 self._callback.start_hash(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) for x in xrange(0, num_entries) : field = self.read_ziplist_entry(buff) @@ -794,7 +794,9 @@ def ntohl(f) : return new_val def to_datetime(usecs_since_epoch): - seconds_since_epoch = usecs_since_epoch / 1000000 + seconds_since_epoch = usecs_since_epoch // 1000000 + if seconds_since_epoch > 221925052800 : + seconds_since_epoch = 221925052800 useconds = usecs_since_epoch % 1000000 dt = datetime.datetime.utcfromtimestamp(seconds_since_epoch) delta = datetime.timedelta(microseconds = useconds) From d7965253799df9850b27a8c8106e89c303041e15 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Thu, 11 Aug 2016 22:54:25 +0200 Subject: [PATCH 27/41] element_length: remove dependency on python long --- rdbtools/memprofiler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 9f490a2..844c134 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -475,12 +475,15 @@ def zset_random_level(self): return level else: return ZSKIPLIST_MAXLEVEL - + +MAXINT 
= 2**63 - 1 + def element_length(element): if isinstance(element, int): - return 8 - if isinstance(element, long): - return 16 + if element < - MAXINT - 1 or element > MAXINT: + return 16 + else: + return 8 else: return len(element) From e7b9e27410c0131d57d9007094bec26141712ab9 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Wed, 19 Oct 2016 19:50:00 +0200 Subject: [PATCH 28/41] python2 add compatible range from python3 --- rdbtools/compat.py | 8 ++++++++ rdbtools/parser.py | 27 ++++++++++++++------------- tests/create_test_rdb.py | 11 ++++++----- tests/parser_tests.py | 5 +++-- 4 files changed, 31 insertions(+), 20 deletions(-) create mode 100644 rdbtools/compat.py diff --git a/rdbtools/compat.py b/rdbtools/compat.py new file mode 100644 index 0000000..99eb5ea --- /dev/null +++ b/rdbtools/compat.py @@ -0,0 +1,8 @@ +# python2->3 compat + +try: + xrange + range = xrange +except NameError: + range = range + diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 40c7463..a1b98c8 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -3,6 +3,7 @@ import sys import datetime import re +from .compat import range try: try: @@ -428,7 +429,7 @@ def read_object(self, f, enc_type) : # and the last string is the tail of the list length = self.read_length(f) self._callback.start_list(self._key, self._expiry, info={'encoding':'linkedlist' }) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) self._callback.rpush(self._key, val) self._callback.end_list(self._key, info={'encoding':'linkedlist' }) @@ -438,14 +439,14 @@ def read_object(self, f, enc_type) : # Note that the order of strings is non-deterministic length = self.read_length(f) self._callback.start_set(self._key, length, self._expiry, info={'encoding':'hashtable'}) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) self._callback.sadd(self._key, val) self._callback.end_set(self._key) elif enc_type == REDIS_RDB_TYPE_ZSET : 
length = self.read_length(f) self._callback.start_sorted_set(self._key, length, self._expiry, info={'encoding':'skiplist'}) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) score = self.read_float(f) self._callback.zadd(self._key, score, val) @@ -453,7 +454,7 @@ def read_object(self, f, enc_type) : elif enc_type == REDIS_RDB_TYPE_HASH : length = self.read_length(f) self._callback.start_hash(self._key, length, self._expiry, info={'encoding':'hashtable'}) - for count in xrange(0, length) : + for count in range(0, length) : field = self.read_string(f) value = self.read_string(f) self._callback.hset(self._key, field, value) @@ -524,7 +525,7 @@ def skip_object(self, f, enc_type): skip_strings = self.read_length(f) else : raise Exception('skip_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) - for x in xrange(0, skip_strings): + for x in range(0, skip_strings): self.skip_string(f) @@ -534,7 +535,7 @@ def read_intset(self, f) : encoding = read_unsigned_int(buff) num_entries = read_unsigned_int(buff) self._callback.start_set(self._key, num_entries, self._expiry, info={'encoding':'intset', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : if encoding == 8 : entry = read_signed_long(buff) elif encoding == 4 : @@ -553,7 +554,7 @@ def read_ziplist(self, f) : tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) self._callback.start_list(self._key, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : val = self.read_ziplist_entry(buff) self._callback.rpush(self._key, val) zlist_end = read_unsigned_char(buff) @@ -565,14 +566,14 @@ def read_list_from_quicklist(self, f): count = self.read_length(f) total_size = 0 self._callback.start_list(self._key, self._expiry, info={'encoding': 'quicklist', 'zips': count}) - for i in xrange(0, count): + for i 
in range(0, count): raw_string = self.read_string(f) total_size += len(raw_string) buff = StringIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) - for x in xrange(0, num_entries): + for x in range(0, num_entries): self._callback.rpush(self._key, self.read_ziplist_entry(buff)) zlist_end = read_unsigned_char(buff) if zlist_end != 255: @@ -589,7 +590,7 @@ def read_zset_from_ziplist(self, f) : raise Exception('read_zset_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) num_entries = num_entries // 2 self._callback.start_sorted_set(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : member = self.read_ziplist_entry(buff) score = self.read_ziplist_entry(buff) if isinstance(score, str) : @@ -610,7 +611,7 @@ def read_hash_from_ziplist(self, f) : raise Exception('read_hash_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) num_entries = num_entries // 2 self._callback.start_hash(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : field = self.read_ziplist_entry(buff) value = self.read_ziplist_entry(buff) self._callback.hset(self._key, field, value) @@ -759,7 +760,7 @@ def lzf_decompress(self, compressed, expected_length): raise Exception('lzf_decompress', 'ctrl should be a number %s for key %s' % (str(ctrl), self._key)) in_index = in_index + 1 if ctrl < 32 : - for x in xrange(0, ctrl + 1) : + for x in range(0, ctrl + 1) : out_stream.append(in_stream[in_index]) #sys.stdout.write(chr(in_stream[in_index])) in_index = in_index + 1 @@ -772,7 +773,7 @@ def lzf_decompress(self, compressed, expected_length): ref = out_index - ((ctrl & 0x1f) << 8) - in_stream[in_index] - 
1 in_index = in_index + 1 - for x in xrange(0, length + 2) : + for x in range(0, length + 2) : out_stream.append(out_stream[ref]) ref = ref + 1 out_index = out_index + 1 diff --git a/tests/create_test_rdb.py b/tests/create_test_rdb.py index 1367573..88d967f 100644 --- a/tests/create_test_rdb.py +++ b/tests/create_test_rdb.py @@ -3,6 +3,7 @@ import string import shutil import os +from rdbtools.compat import range r = redis.StrictRedis() r2 = redis.StrictRedis(db=2) @@ -92,13 +93,13 @@ def zipmap_with_big_values(): def dictionary() : num_entries = 1000 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.hset("force_dictionary", random_string(50, x), random_string(50, x + num_entries)) def ziplist_that_compresses_easily() : for length in (6, 12, 18, 24, 30, 36) : - r.rpush("ziplist_compresses_easily", ("".join("a" for x in xrange(length)))) - + r.rpush("ziplist_compresses_easily", ("".join("a" for x in range(length)))) + def ziplist_that_doesnt_compress() : r.rpush("ziplist_doesnt_compress", "aj2410") r.rpush("ziplist_doesnt_compress", "cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344") @@ -131,7 +132,7 @@ def ziplist_with_integers() : def linkedlist() : num_entries = 1000 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.rpush("force_linkedlist", random_string(50, x)) def intset_16() : @@ -164,7 +165,7 @@ def sorted_set_as_ziplist() : def regular_sorted_set() : num_entries = 500 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.zadd("force_sorted_set", float(x) / 100, random_string(50, x)) def random_string(length, seed) : diff --git a/tests/parser_tests.py b/tests/parser_tests.py index 1cda6f9..707044e 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -2,6 +2,7 @@ import os import math from rdbtools import RdbCallback, RdbParser +from rdbtools.compat import range class RedisParserTestCase(unittest.TestCase): def setUp(self): @@ -99,8 +100,8 @@ def 
test_ziplist_that_compresses_easily(self): r = load_rdb('ziplist_that_compresses_easily.rdb') self.assertEquals(r.lengths[0]["ziplist_compresses_easily"], 6) for idx, length in enumerate([6, 12, 18, 24, 30, 36]) : - self.assertEquals(("".join("a" for x in xrange(length))), r.databases[0]["ziplist_compresses_easily"][idx]) - + self.assertEquals(("".join("a" for x in range(length))), r.databases[0]["ziplist_compresses_easily"][idx]) + def test_ziplist_that_doesnt_compress(self): r = load_rdb('ziplist_that_doesnt_compress.rdb') self.assertEquals(r.lengths[0]["ziplist_doesnt_compress"], 2) From 07eaec7955611b36a6918655df7954d23d4f61a2 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Thu, 11 Aug 2016 23:27:45 +0200 Subject: [PATCH 29/41] python3 add compatible isinteger test --- rdbtools/callbacks.py | 3 ++- rdbtools/compat.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index 3fc3f49..7a18fd3 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -4,6 +4,7 @@ import sys import struct from rdbtools.parser import RdbCallback, RdbParser +from .compat import isinteger ESCAPE = re.compile(ur'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]') ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') @@ -73,7 +74,7 @@ def _encode(s, quote_numbers = True): qn = '"' else: qn = '' - if isinstance(s, int) or isinstance(s, long): + if isinteger(s): return qn + str(s) + qn elif isinstance(s, float): if s != s: diff --git a/rdbtools/compat.py b/rdbtools/compat.py index 99eb5ea..47ade67 100644 --- a/rdbtools/compat.py +++ b/rdbtools/compat.py @@ -6,3 +6,11 @@ except NameError: range = range +try: + long + def isinteger(n): + return isinstance(n, int) or isinstance(n, long) +except NameError: + def isinteger(n): + return isinstance(n, int) + From aec81eb2f6a794b7da6305447763fcb4b569d5e9 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Wed, 19 Oct 2016 20:04:59 +0200 Subject: [PATCH 30/41] migrate strings to bytes 
where apropriate --- rdbtools/callbacks.py | 2 +- rdbtools/parser.py | 21 ++--- tests/memprofiler_tests.py | 2 +- tests/parser_tests.py | 156 ++++++++++++++++++------------------- 4 files changed, 91 insertions(+), 90 deletions(-) diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index 7a18fd3..97e94dd 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -24,7 +24,7 @@ ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) def _floatconstants(): - _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') + _BYTES = b'\x7F\xF8\x00\x00\x00\x00\x00\x00\x7F\xF0\x00\x00\x00\x00\x00\x00' # The struct module in Python 2.4 would get frexp() out of range here # when an endian is specified in the format string. Fixed in Python 2.5+ if sys.byteorder != 'big': diff --git a/rdbtools/parser.py b/rdbtools/parser.py index a1b98c8..07d8180 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -4,6 +4,7 @@ import datetime import re from .compat import range +from io import BytesIO try: try: @@ -531,7 +532,7 @@ def skip_object(self, f, enc_type): def read_intset(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) encoding = read_unsigned_int(buff) num_entries = read_unsigned_int(buff) self._callback.start_set(self._key, num_entries, self._expiry, info={'encoding':'intset', 'sizeof_value':len(raw_string)}) @@ -549,7 +550,7 @@ def read_intset(self, f) : def read_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) @@ -582,7 +583,7 @@ def read_list_from_quicklist(self, f): def read_zset_from_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) @@ -593,7 +594,7 @@ def 
read_zset_from_ziplist(self, f) : for x in range(0, num_entries) : member = self.read_ziplist_entry(buff) score = self.read_ziplist_entry(buff) - if isinstance(score, str) : + if isinstance(score, bytes) : score = float(score) self._callback.zadd(self._key, score, member) zlist_end = read_unsigned_char(buff) @@ -603,7 +604,7 @@ def read_zset_from_ziplist(self, f) : def read_hash_from_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) @@ -687,7 +688,7 @@ def read_zipmap_next_length(self, f) : return None def verify_magic_string(self, magic_string) : - if magic_string != 'REDIS' : + if magic_string != b'REDIS' : raise Exception('verify_magic_string', 'Invalid File Format') def verify_version(self, version_str) : @@ -711,7 +712,7 @@ def init_filter(self, filters): raise Exception('init_filter', 'invalid value for dbs in filter %s' %filters['dbs']) if not ('keys' in filters and filters['keys']): - self._filters['keys'] = re.compile(".*") + self._filters['keys'] = re.compile(b".*") else: self._filters['keys'] = re.compile(filters['keys']) @@ -722,7 +723,7 @@ def init_filter(self, filters): if not 'types' in filters: self._filters['types'] = ('set', 'hash', 'sortedset', 'string', 'list') - elif isinstance(filters['types'], str): + elif isinstance(filters['types'], bytes): self._filters['types'] = (filters['types'], ) elif isinstance(filters['types'], list): self._filters['types'] = [str(x) for x in filters['types']] @@ -779,7 +780,7 @@ def lzf_decompress(self, compressed, expected_length): out_index = out_index + 1 if len(out_stream) != expected_length : raise Exception('lzf_decompress', 'Expected lengths do not match %d != %d for key %s' % (len(out_stream), expected_length, self._key)) - return str(out_stream) + return bytes(out_stream) def skip(f, free): if free : @@ -825,7 +826,7 @@ def 
read_big_endian_unsigned_int(f): return struct.unpack('>I', f.read(4))[0] def read_24bit_signed_number(f): - s = '0' + f.read(3) + s = b'0' + f.read(3) num = struct.unpack('i', s)[0] return num >> 8 diff --git a/tests/memprofiler_tests.py b/tests/memprofiler_tests.py index 6e6d248..0886d81 100644 --- a/tests/memprofiler_tests.py +++ b/tests/memprofiler_tests.py @@ -24,5 +24,5 @@ def setUp(self): def test_len_largest_element(self): stats = get_stats('ziplist_that_compresses_easily.rdb') - self.assertEqual(stats['ziplist_compresses_easily'].len_largest_element, 36, "Length of largest element does not match") + self.assertEqual(stats[b'ziplist_compresses_easily'].len_largest_element, 36, "Length of largest element does not match") diff --git a/tests/parser_tests.py b/tests/parser_tests.py index 707044e..9bd705c 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -21,12 +21,12 @@ def test_multiple_databases(self): r = load_rdb('multiple_databases.rdb') self.assert_(len(r.databases), 2) self.assert_(1 not in r.databases) - self.assertEquals(r.databases[0]["key_in_zeroth_database"], "zero") - self.assertEquals(r.databases[2]["key_in_second_database"], "second") - + self.assertEquals(r.databases[0][b"key_in_zeroth_database"], b"zero") + self.assertEquals(r.databases[2][b"key_in_second_database"], b"second") + def test_keys_with_expiry(self): r = load_rdb('keys_with_expiry.rdb') - expiry = r.expiry[0]['expires_ms_precision'] + expiry = r.expiry[0][b'expires_ms_precision'] self.assertEquals(expiry.year, 2022) self.assertEquals(expiry.month, 12) self.assertEquals(expiry.day, 25) @@ -37,33 +37,33 @@ def test_keys_with_expiry(self): def test_integer_keys(self): r = load_rdb('integer_keys.rdb') - self.assertEquals(r.databases[0][125], "Positive 8 bit integer") - self.assertEquals(r.databases[0][0xABAB], "Positive 16 bit integer") - self.assertEquals(r.databases[0][0x0AEDD325], "Positive 32 bit integer") - + self.assertEquals(r.databases[0][125], b"Positive 8 bit 
integer") + self.assertEquals(r.databases[0][0xABAB], b"Positive 16 bit integer") + self.assertEquals(r.databases[0][0x0AEDD325], b"Positive 32 bit integer") + def test_negative_integer_keys(self): r = load_rdb('integer_keys.rdb') - self.assertEquals(r.databases[0][-123], "Negative 8 bit integer") - self.assertEquals(r.databases[0][-0x7325], "Negative 16 bit integer") - self.assertEquals(r.databases[0][-0x0AEDD325], "Negative 32 bit integer") - + self.assertEquals(r.databases[0][-123], b"Negative 8 bit integer") + self.assertEquals(r.databases[0][-0x7325], b"Negative 16 bit integer") + self.assertEquals(r.databases[0][-0x0AEDD325], b"Negative 32 bit integer") + def test_string_key_with_compression(self): r = load_rdb('easily_compressible_string_key.rdb') - key = "".join('a' for x in range(0, 200)) - value = "Key that redis should compress easily" + key = b"".join(b'a' for x in range(0, 200)) + value = b"Key that redis should compress easily" self.assertEquals(r.databases[0][key], value) def test_zipmap_thats_compresses_easily(self): r = load_rdb('zipmap_that_compresses_easily.rdb') - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["a"], "aa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aa"], "aaaa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aaaaa"], "aaaaaaaaaaaaaa") - + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"a"], b"aa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aa"], b"aaaa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aaaaa"], b"aaaaaaaaaaaaaa") + def test_zipmap_that_doesnt_compress(self): r = load_rdb('zipmap_that_doesnt_compress.rdb') - self.assertEquals(r.databases[0]["zimap_doesnt_compress"]["MKD1G6"], 2) - self.assertEquals(r.databases[0]["zimap_doesnt_compress"]["YNNXK"], "F7TI") - + self.assertEquals(r.databases[0][b"zimap_doesnt_compress"][b"MKD1G6"], 2) + self.assertEquals(r.databases[0][b"zimap_doesnt_compress"][b"YNNXK"], 
b"F7TI") + def test_zipmap_with_big_values(self): ''' See issue https://github.com/sripathikrishnan/redis-rdb-tools/issues/2 Values with length around 253/254/255 bytes are treated specially in the parser @@ -75,40 +75,40 @@ def test_zipmap_with_big_values(self): ziplist with a length encoding of 5 bytes. ''' r = load_rdb('zipmap_with_big_values.rdb') - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["253bytes"]), 253) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["254bytes"]), 254) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["255bytes"]), 255) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["300bytes"]), 300) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["20kbytes"]), 20000) - + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"253bytes"]), 253) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"254bytes"]), 254) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"255bytes"]), 255) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"300bytes"]), 300) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"20kbytes"]), 20000) + def test_hash_as_ziplist(self): '''In redis dump version = 4, hashmaps are stored as ziplists''' r = load_rdb('hash_as_ziplist.rdb') - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["a"], "aa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aa"], "aaaa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aaaaa"], "aaaaaaaaaaaaaa") - + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"a"], b"aa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aa"], b"aaaa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aaaaa"], b"aaaaaaaaaaaaaa") + def test_dictionary(self): r = load_rdb('dictionary.rdb') - self.assertEquals(r.lengths[0]["force_dictionary"], 1000) - 
self.assertEquals(r.databases[0]["force_dictionary"]["ZMU5WEJDG7KU89AOG5LJT6K7HMNB3DEI43M6EYTJ83VRJ6XNXQ"], - "T63SOS8DQJF0Q0VJEZ0D1IQFCYTIPSBOUIAI9SB0OV57MQR1FI") - self.assertEquals(r.databases[0]["force_dictionary"]["UHS5ESW4HLK8XOGTM39IK1SJEUGVV9WOPK6JYA5QBZSJU84491"], - "6VULTCV52FXJ8MGVSFTZVAGK2JXZMGQ5F8OVJI0X6GEDDR27RZ") - + self.assertEquals(r.lengths[0][b"force_dictionary"], 1000) + self.assertEquals(r.databases[0][b"force_dictionary"][b"ZMU5WEJDG7KU89AOG5LJT6K7HMNB3DEI43M6EYTJ83VRJ6XNXQ"], + b"T63SOS8DQJF0Q0VJEZ0D1IQFCYTIPSBOUIAI9SB0OV57MQR1FI") + self.assertEquals(r.databases[0][b"force_dictionary"][b"UHS5ESW4HLK8XOGTM39IK1SJEUGVV9WOPK6JYA5QBZSJU84491"], + b"6VULTCV52FXJ8MGVSFTZVAGK2JXZMGQ5F8OVJI0X6GEDDR27RZ") + def test_ziplist_that_compresses_easily(self): r = load_rdb('ziplist_that_compresses_easily.rdb') - self.assertEquals(r.lengths[0]["ziplist_compresses_easily"], 6) + self.assertEquals(r.lengths[0][b"ziplist_compresses_easily"], 6) for idx, length in enumerate([6, 12, 18, 24, 30, 36]) : - self.assertEquals(("".join("a" for x in range(length))), r.databases[0]["ziplist_compresses_easily"][idx]) + self.assertEquals((b"".join(b"a" for x in range(length))), r.databases[0][b"ziplist_compresses_easily"][idx]) def test_ziplist_that_doesnt_compress(self): r = load_rdb('ziplist_that_doesnt_compress.rdb') - self.assertEquals(r.lengths[0]["ziplist_doesnt_compress"], 2) - self.assert_("aj2410" in r.databases[0]["ziplist_doesnt_compress"]) - self.assert_("cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344" - in r.databases[0]["ziplist_doesnt_compress"]) - + self.assertEquals(r.lengths[0][b"ziplist_doesnt_compress"], 2) + self.assert_(b"aj2410" in r.databases[0][b"ziplist_doesnt_compress"]) + self.assert_(b"cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344" + in r.databases[0][b"ziplist_doesnt_compress"]) + def test_ziplist_with_integers(self): r = load_rdb('ziplist_with_integers.rdb') @@ -118,84 +118,84 @@ def 
test_ziplist_with_integers(self): expected_numbers += [-2, 13, 25, -61, 63, 16380, -16000, 65535, -65523, 4194304, 0x7fffffffffffffff] - self.assertEquals(r.lengths[0]["ziplist_with_integers"], len(expected_numbers)) + self.assertEquals(r.lengths[0][b"ziplist_with_integers"], len(expected_numbers)) for num in expected_numbers : - self.assert_(num in r.databases[0]["ziplist_with_integers"], "Cannot find %d" % num) + self.assert_(num in r.databases[0][b"ziplist_with_integers"], "Cannot find %d" % num) def test_linkedlist(self): r = load_rdb('linkedlist.rdb') - self.assertEquals(r.lengths[0]["force_linkedlist"], 1000) - self.assert_("JYY4GIFI0ETHKP4VAJF5333082J4R1UPNPLE329YT0EYPGHSJQ" in r.databases[0]["force_linkedlist"]) - self.assert_("TKBXHJOX9Q99ICF4V78XTCA2Y1UYW6ERL35JCIL1O0KSGXS58S" in r.databases[0]["force_linkedlist"]) + self.assertEquals(r.lengths[0][b"force_linkedlist"], 1000) + self.assert_(b"JYY4GIFI0ETHKP4VAJF5333082J4R1UPNPLE329YT0EYPGHSJQ" in r.databases[0][b"force_linkedlist"]) + self.assert_(b"TKBXHJOX9Q99ICF4V78XTCA2Y1UYW6ERL35JCIL1O0KSGXS58S" in r.databases[0][b"force_linkedlist"]) def test_intset_16(self): r = load_rdb('intset_16.rdb') - self.assertEquals(r.lengths[0]["intset_16"], 3) + self.assertEquals(r.lengths[0][b"intset_16"], 3) for num in (0x7ffe, 0x7ffd, 0x7ffc) : - self.assert_(num in r.databases[0]["intset_16"]) + self.assert_(num in r.databases[0][b"intset_16"]) def test_intset_32(self): r = load_rdb('intset_32.rdb') - self.assertEquals(r.lengths[0]["intset_32"], 3) + self.assertEquals(r.lengths[0][b"intset_32"], 3) for num in (0x7ffefffe, 0x7ffefffd, 0x7ffefffc) : - self.assert_(num in r.databases[0]["intset_32"]) + self.assert_(num in r.databases[0][b"intset_32"]) def test_intset_64(self): r = load_rdb('intset_64.rdb') - self.assertEquals(r.lengths[0]["intset_64"], 3) + self.assertEquals(r.lengths[0][b"intset_64"], 3) for num in (0x7ffefffefffefffe, 0x7ffefffefffefffd, 0x7ffefffefffefffc) : - self.assert_(num in 
r.databases[0]["intset_64"]) + self.assert_(num in r.databases[0][b"intset_64"]) def test_regular_set(self): r = load_rdb('regular_set.rdb') - self.assertEquals(r.lengths[0]["regular_set"], 6) - for member in ("alpha", "beta", "gamma", "delta", "phi", "kappa") : - self.assert_(member in r.databases[0]["regular_set"], msg=('%s missing' % member)) + self.assertEquals(r.lengths[0][b"regular_set"], 6) + for member in (b"alpha", b"beta", b"gamma", b"delta", b"phi", b"kappa") : + self.assert_(member in r.databases[0][b"regular_set"], msg=('%s missing' % member)) def test_sorted_set_as_ziplist(self): r = load_rdb('sorted_set_as_ziplist.rdb') - self.assertEquals(r.lengths[0]["sorted_set_as_ziplist"], 3) - zset = r.databases[0]["sorted_set_as_ziplist"] - self.assert_(floateq(zset['8b6ba6718a786daefa69438148361901'], 1)) - self.assert_(floateq(zset['cb7a24bb7528f934b841b34c3a73e0c7'], 2.37)) - self.assert_(floateq(zset['523af537946b79c4f8369ed39ba78605'], 3.423)) + self.assertEquals(r.lengths[0][b"sorted_set_as_ziplist"], 3) + zset = r.databases[0][b"sorted_set_as_ziplist"] + self.assert_(floateq(zset[b'8b6ba6718a786daefa69438148361901'], 1)) + self.assert_(floateq(zset[b'cb7a24bb7528f934b841b34c3a73e0c7'], 2.37)) + self.assert_(floateq(zset[b'523af537946b79c4f8369ed39ba78605'], 3.423)) def test_filtering_by_keys(self): r = load_rdb('parser_filters.rdb', filters={"keys":"k[0-9]"}) - self.assertEquals(r.databases[0]['k1'], "ssssssss") - self.assertEquals(r.databases[0]['k3'], "wwwwwwww") + self.assertEquals(r.databases[0][b'k1'], b"ssssssss") + self.assertEquals(r.databases[0][b'k3'], b"wwwwwwww") self.assertEquals(len(r.databases[0]), 2) def test_filtering_by_type(self): r = load_rdb('parser_filters.rdb', filters={"types":["sortedset"]}) - self.assert_('z1' in r.databases[0]) - self.assert_('z2' in r.databases[0]) - self.assert_('z3' in r.databases[0]) - self.assert_('z4' in r.databases[0]) + self.assert_(b'z1' in r.databases[0]) + self.assert_(b'z2' in r.databases[0]) + 
self.assert_(b'z3' in r.databases[0]) + self.assert_(b'z4' in r.databases[0]) self.assertEquals(len(r.databases[0]), 4) def test_filtering_by_database(self): r = load_rdb('multiple_databases.rdb', filters={"dbs":[2]}) - self.assert_('key_in_zeroth_database' not in r.databases[0]) - self.assert_('key_in_second_database' in r.databases[2]) + self.assert_(b'key_in_zeroth_database' not in r.databases[0]) + self.assert_(b'key_in_second_database' in r.databases[2]) self.assertEquals(len(r.databases[0]), 0) self.assertEquals(len(r.databases[2]), 1) def test_rdb_version_5_with_checksum(self): r = load_rdb('rdb_version_5_with_checksum.rdb') - self.assertEquals(r.databases[0]['abcd'], 'efgh') - self.assertEquals(r.databases[0]['foo'], 'bar') - self.assertEquals(r.databases[0]['bar'], 'baz') - self.assertEquals(r.databases[0]['abcdef'], 'abcdef') - self.assertEquals(r.databases[0]['longerstring'], 'thisisalongerstring.idontknowwhatitmeans') + self.assertEquals(r.databases[0][b'abcd'], b'efgh') + self.assertEquals(r.databases[0][b'foo'], b'bar') + self.assertEquals(r.databases[0][b'bar'], b'baz') + self.assertEquals(r.databases[0][b'abcdef'], b'abcdef') + self.assertEquals(r.databases[0][b'longerstring'], b'thisisalongerstring.idontknowwhatitmeans') def test_multiple_databases_stream(self): r = load_rdb_stream('multiple_databases.rdb') self.assert_(len(r.databases), 2) self.assert_(1 not in r.databases) - self.assertEquals(r.databases[0]["key_in_zeroth_database"], "zero") - self.assertEquals(r.databases[2]["key_in_second_database"], "second") + self.assertEquals(r.databases[0][b"key_in_zeroth_database"], b"zero") + self.assertEquals(r.databases[2][b"key_in_second_database"], b"second") def floateq(f1, f2) : return math.fabs(f1 - f2) < 0.00001 From 9a344494de8ee6312681ac8d27b8804f4f992487 Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Wed, 19 Oct 2016 20:17:45 +0200 Subject: [PATCH 31/41] python3 compat regexps --- rdbtools/callbacks.py | 11 +++++++++-- rdbtools/compat.py 
| 9 +++++++++ rdbtools/parser.py | 18 +++++++++++++----- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index 97e94dd..eeb19df 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -6,8 +6,15 @@ from rdbtools.parser import RdbCallback, RdbParser from .compat import isinteger -ESCAPE = re.compile(ur'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]') -ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') +if sys.version_info < (3,): + import codecs + def u(x): return codecs.unicode_escape_decode(x)[0] +else: + def u(x): return x + +ESCAPE = re.compile(u(r'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]')) +ESCAPE_ASCII = re.compile(br'([\\"]|[^\ -~])') + HAS_UTF8 = re.compile(r'[\x80-\xff]') ESCAPE_DCT = { '\\': '\\\\', diff --git a/rdbtools/compat.py b/rdbtools/compat.py index 47ade67..dc72cef 100644 --- a/rdbtools/compat.py +++ b/rdbtools/compat.py @@ -1,5 +1,7 @@ # python2->3 compat +import sys, re + try: xrange range = xrange @@ -14,3 +16,10 @@ def isinteger(n): def isinteger(n): return isinstance(n, int) +if sys.version_info < (3,): + def str2regexp(pattern): + return re.compile(pattern) +else: + def str2regexp(pattern): + return re.compile(pattern.encode('utf-8')) + diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 07d8180..b027d7e 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -3,7 +3,7 @@ import sys import datetime import re -from .compat import range +from .compat import range, str2regexp from io import BytesIO try: @@ -714,12 +714,12 @@ def init_filter(self, filters): if not ('keys' in filters and filters['keys']): self._filters['keys'] = re.compile(b".*") else: - self._filters['keys'] = re.compile(filters['keys']) + self._filters['keys'] = str2regexp(filters['keys']) if not ('not_keys' in filters and filters['not_keys']): self._filters['not_keys'] = None else: - self._filters['not_keys'] = re.compile(filters['not_keys']) + self._filters['not_keys'] = str2regexp(filters['not_keys']) if not 
'types' in filters: self._filters['types'] = ('set', 'hash', 'sortedset', 'string', 'list') @@ -731,11 +731,19 @@ def init_filter(self, filters): raise Exception('init_filter', 'invalid value for types in filter %s' %filters['types']) def matches_filter(self, db_number, key=None, data_type=None): + + if isinstance(key, bytes): + key_to_match = key + elif isinstance(key, str): # bytes key in python2 + key_to_match = key + else: + key_to_match = str(key).encode('utf-8') + if self._filters['dbs'] and (not db_number in self._filters['dbs']): return False - if key and self._filters['not_keys'] and (self._filters['not_keys'].match(str(key))): + if key and self._filters['not_keys'] and (self._filters['not_keys'].match(key_to_match)): return False - if key and (not self._filters['keys'].match(str(key))): + if key and (not self._filters['keys'].match(key_to_match)): return False if data_type is not None and (not self.get_logical_type(data_type) in self._filters['types']): From 314f15899cb6ff374556cb315cef138f77920bdc Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Wed, 19 Oct 2016 20:18:38 +0200 Subject: [PATCH 32/41] decalare python3 compatibility --- .travis.yml | 2 ++ README.md | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index da8b5d3..dc0e4d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,8 @@ language: python python: - "2.6" - "2.7" + - "3.4" + - "3.5" script: python run_tests diff --git a/README.md b/README.md index fcf2841..00e1cde 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ Rdbtools is written in Python, though there are similar projects in other langua Pre-Requisites : -1. python 2.x and pip. -2. redis-py is optional and only needed to run test cases. +1. redis-py is optional and only needed to run test cases. 
To install from PyPI (recommended) : From 9ec8a527a97f4dff4f80783fe90319bf4da294af Mon Sep 17 00:00:00 2001 From: Michal Humpula Date: Thu, 20 Oct 2016 18:31:43 +0200 Subject: [PATCH 33/41] remove obsolete requirements.txt --- requirements.txt | 2 -- setup.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a4a97ff..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -redis==2.4.12 -wsgiref==0.1.2 diff --git a/setup.py b/setup.py index 74b376d..b596c77 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ 'packages' : ['rdbtools', 'rdbtools.cli'], 'package_data' : {'rdbtools.cli': ['*.template']}, 'test_suite' : 'tests.all_tests', + 'tests_require': ['redis'], 'entry_points' : { 'console_scripts' : [ 'rdb = rdbtools.cli.rdb:main', From 9f521469acaa1776c6df61f8c12c9556cce8065b Mon Sep 17 00:00:00 2001 From: oranagra Date: Wed, 26 Oct 2016 21:05:33 +0300 Subject: [PATCH 34/41] turns we do use redis-py (StrictRedis), adding depencency back --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b596c77..e2d7fb8 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ 'packages' : ['rdbtools', 'rdbtools.cli'], 'package_data' : {'rdbtools.cli': ['*.template']}, 'test_suite' : 'tests.all_tests', - 'tests_require': ['redis'], + 'install_requires': ['redis'], 'entry_points' : { 'console_scripts' : [ 'rdb = rdbtools.cli.rdb:main', From f4c7080cf567160d5f00aa06e493cf45f5e5f583 Mon Sep 17 00:00:00 2001 From: Amotz Getzov Date: Thu, 5 Jan 2017 12:08:23 +0200 Subject: [PATCH 35/41] Fix ProtocolCallback.emit() unicode conversion to be content neutral. 
--- rdbtools/callbacks.py | 7 ++++--- tests/__init__.py | 2 ++ tests/callbacks_tests.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 tests/callbacks_tests.py diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index eeb19df..c702f7f 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -411,10 +411,11 @@ def post_expiry(self, key): self.expireat(key, self.get_expiry_seconds(key)) def emit(self, *args): - self._out.write(u"*" + unicode(len(args)) + u"\r\n") + self._out.write("*{}\r\n".format(len(args))) for arg in args: - self._out.write(u"$" + unicode(len(unicode(arg))) + u"\r\n") - self._out.write(unicode(arg) + u"\r\n") + val = str(arg) + self._out.write("${}\r\n".format(len(val))) + self._out.write(val + "\r\n") def start_database(self, db_number): self.reset() diff --git a/tests/__init__.py b/tests/__init__.py index f55c428..5f86dec 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,9 +1,11 @@ import unittest from tests.parser_tests import RedisParserTestCase from tests.memprofiler_tests import MemoryCallbackTestCase +from tests.callbacks_tests import ProtocolTestCase def all_tests(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(RedisParserTestCase)) suite.addTest(unittest.makeSuite(MemoryCallbackTestCase)) + suite.addTest(unittest.makeSuite(ProtocolTestCase)) return suite diff --git a/tests/callbacks_tests.py b/tests/callbacks_tests.py new file mode 100644 index 0000000..04db1be --- /dev/null +++ b/tests/callbacks_tests.py @@ -0,0 +1,20 @@ +import unittest +import random +from io import BytesIO +from rdbtools.callbacks import ProtocolCallback + + +class ProtocolTestCase(unittest.TestCase): + def setUp(self): + self._out = BytesIO() + self._callback = ProtocolCallback(self._out) + + def test_emit(self): + utf8_string = '\xd9\xa1\xdf\x82\xe0\xa5\xa9\xe1\xa7\x94\xf0\x91\x8b\xb5' + bcde_non_print = '\x00\x01bcde\r\n\x88\x99' + random_bytes = 
''.join(chr(random.randrange(0, 255)) for _ in range(64)) + integer = 46 + self._callback.emit(utf8_string, bcde_non_print, random_bytes, integer) + expected = '\r\n'.join(['*4', '$14', utf8_string, '$10', bcde_non_print, '$64', random_bytes, '$2', '46\r\n']) + result = self._out.getvalue() + self.assertEquals(result, expected) From 481baf7c526025e10e9448267650b97f5eaedc62 Mon Sep 17 00:00:00 2001 From: Amotz Getzov Date: Mon, 13 Feb 2017 15:27:21 +0200 Subject: [PATCH 36/41] Add command line and RdbCallback option for strings escape, and adjust for Python2/3 compat. --- rdbtools/callbacks.py | 243 ++++++++++----------------- rdbtools/cli/rdb.py | 46 +++-- rdbtools/cli/redis_memory_for_key.py | 14 +- rdbtools/encodehelpers.py | 154 +++++++++++++++++ rdbtools/memprofiler.py | 50 +++--- rdbtools/parser.py | 29 +++- tests/__init__.py | 15 +- tests/callbacks_tests.py | 125 ++++++++++++-- tests/dumps/non_ascii_values.rdb | Bin 0 -> 202 bytes tests/parser_tests.py | 3 +- 10 files changed, 453 insertions(+), 226 deletions(-) create mode 100644 rdbtools/encodehelpers.py create mode 100644 tests/dumps/non_ascii_values.rdb diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index c702f7f..860c3c9 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -1,123 +1,39 @@ import calendar -import re -from decimal import Decimal -import sys -import struct -from rdbtools.parser import RdbCallback, RdbParser -from .compat import isinteger - -if sys.version_info < (3,): - import codecs - def u(x): return codecs.unicode_escape_decode(x)[0] -else: - def u(x): return x - -ESCAPE = re.compile(u(r'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]')) -ESCAPE_ASCII = re.compile(br'([\\"]|[^\ -~])') - -HAS_UTF8 = re.compile(r'[\x80-\xff]') -ESCAPE_DCT = { - '\\': '\\\\', - '"': '\\"', - '\b': '\\b', - '\f': '\\f', - '\n': '\\n', - '\r': '\\r', - '\t': '\\t', - u'\u2028': '\\u2028', - u'\u2029': '\\u2029', -} -for i in range(0x20): - ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) - -def 
_floatconstants(): - _BYTES = b'\x7F\xF8\x00\x00\x00\x00\x00\x00\x7F\xF0\x00\x00\x00\x00\x00\x00' - # The struct module in Python 2.4 would get frexp() out of range here - # when an endian is specified in the format string. Fixed in Python 2.5+ - if sys.byteorder != 'big': - _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] - nan, inf = struct.unpack('dd', _BYTES) - return nan, inf, -inf - -NaN, PosInf, NegInf = _floatconstants() - -def _encode_basestring(s): - """Return a JSON representation of a Python string""" - if isinstance(s, str) and HAS_UTF8.search(s) is not None: - s = s.decode('utf-8') - def replace(match): - return ESCAPE_DCT[match.group(0)] - return u'"' + ESCAPE.sub(replace, s) + u'"' - -def _encode_basestring_ascii(s): - """Return an ASCII-only JSON representation of a Python string - - """ - try : - if isinstance(s, str) and HAS_UTF8.search(s) is not None: - s = s.decode('utf-8') - except: - pass +import codecs +import json - def replace(match): - s = match.group(0) - try: - return ESCAPE_DCT[s] - except KeyError: - n = ord(s) - if n < 0x10000: - #return '\\u{0:04x}'.format(n) - return '\\u%04x' % (n,) - else: - # surrogate pair - n -= 0x10000 - s1 = 0xd800 | ((n >> 10) & 0x3ff) - s2 = 0xdc00 | (n & 0x3ff) - return '\\u%04x\\u%04x' % (s1, s2) - return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' - -def _encode(s, quote_numbers = True): - if quote_numbers: - qn = '"' - else: - qn = '' - if isinteger(s): - return qn + str(s) + qn - elif isinstance(s, float): - if s != s: - return "NaN" - elif s == PosInf: - return "Infinity" - elif s == NegInf: - return "-Infinity" - else: - return qn + str(s) + qn - else: - return _encode_basestring_ascii(s) - -def encode_key(s): - return _encode(s, quote_numbers=True) - -def encode_value(s): - return _encode(s, quote_numbers=False) +from rdbtools.compat import isinteger +from rdbtools.parser import RdbCallback +from rdbtools import encodehelpers class JSONCallback(RdbCallback): - def __init__(self, out): + def 
__init__(self, out, string_escape=None): + if string_escape is None: + string_escape = encodehelpers.STRING_ESCAPE_UTF8 + super(JSONCallback, self).__init__(string_escape) self._out = out self._is_first_db = True self._has_databases = False self._is_first_key_in_db = True self._elements_in_key = 0 self._element_index = 0 - + + def encode_key(self, key): + key = encodehelpers.bytes_to_unicode(key, self._escape, skip_printable=True) + return codecs.encode(json.dumps(key), 'utf-8') + + def encode_value(self, val): + val = encodehelpers.bytes_to_unicode(val, self._escape) + return codecs.encode(json.dumps(val), 'utf-8') + def start_rdb(self): - self._out.write('[') + self._out.write(b'[') def start_database(self, db_number): if not self._is_first_db: - self._out.write('},') - self._out.write('{') + self._out.write(b'},') + self._out.write(b'{') self._is_first_db = False self._has_databases = True self._is_first_key_in_db = True @@ -127,13 +43,13 @@ def end_database(self, db_number): def end_rdb(self): if self._has_databases: - self._out.write('}') - self._out.write(']') + self._out.write(b'}') + self._out.write(b']') def _start_key(self, key, length): if not self._is_first_key_in_db: - self._out.write(',') - self._out.write('\r\n') + self._out.write(b',') + self._out.write(b'\r\n') self._is_first_key_in_db = False self._elements_in_key = length self._element_index = 0 @@ -143,69 +59,70 @@ def _end_key(self, key): def _write_comma(self): if self._element_index > 0 and self._element_index < self._elements_in_key : - self._out.write(',') + self._out.write(b',') self._element_index = self._element_index + 1 def set(self, key, value, expiry, info): self._start_key(key, 0) - self._out.write('%s:%s' % (encode_key(key), encode_value(value))) + self._out.write(self.encode_key(key) + b':' + self.encode_value(value)) def start_hash(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s:{' % encode_key(key)) + self._out.write(self.encode_key(key) + 
b':{') def hset(self, key, field, value): self._write_comma() - self._out.write('%s:%s' % (encode_key(field), encode_value(value))) + self._out.write(self.encode_key(field) + b':' + self.encode_value(value)) def end_hash(self, key): self._end_key(key) - self._out.write('}') + self._out.write(b'}') def start_set(self, key, cardinality, expiry, info): self._start_key(key, cardinality) - self._out.write('%s:[' % encode_key(key)) + self._out.write(self.encode_key(key) + b':[') def sadd(self, key, member): self._write_comma() - self._out.write('%s' % encode_value(member)) + self._out.write(self.encode_value(member)) def end_set(self, key): self._end_key(key) - self._out.write(']') + self._out.write(b']') def start_list(self, key, expiry, info): self._start_key(key, 0) - self._out.write('%s:[' % encode_key(key)) + self._out.write(self.encode_key(key) + b':[') def rpush(self, key, value) : self._elements_in_key += 1 self._write_comma() - self._out.write('%s' % encode_value(value)) + self._out.write(self.encode_value(value)) def end_list(self, key, info): self._end_key(key) - self._out.write(']') + self._out.write(b']') def start_sorted_set(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s:{' % encode_key(key)) + self._out.write(self.encode_key(key) + b':{') def zadd(self, key, score, member): self._write_comma() - self._out.write('%s:%s' % (encode_key(member), encode_value(score))) + self._out.write(self.encode_key(member) + b':' + self.encode_value(score)) def end_sorted_set(self, key): self._end_key(key) - self._out.write('}') + self._out.write(b'}') class KeysOnlyCallback(RdbCallback): - def __init__(self, out): + def __init__(self, out, string_escape=None): + super(KeysOnlyCallback, self).__init__(string_escape) self._out = out def _keyout(self, key): - self._out.write('%s\n' % (encode_key(key))) + self._out.write(self.encode_key(key) + b'\n') def set(self, key, value, expiry, info): self._keyout(key) @@ -236,7 +153,8 @@ def 
zadd(self, key, score, member): class KeyValsOnlyCallback(RdbCallback): - def __init__(self, out): + def __init__(self, out, string_escape=None): + super(KeyValsOnlyCallback, self).__init__(string_escape) self._out = out self._is_first_db = True self._has_databases = False @@ -246,8 +164,8 @@ def __init__(self, out): def _start_key(self, key, length): if not self._is_first_key_in_db: - self._out.write(',') - self._out.write('\r\n') + self._out.write(b',') + self._out.write(b'\r\n') self._is_first_key_in_db = False self._elements_in_key = length self._element_index = 0 @@ -257,54 +175,54 @@ def _end_key(self, key): def _write_comma(self): if self._element_index > 0 and self._element_index < self._elements_in_key : - self._out.write(',') + self._out.write(b',') self._element_index = self._element_index + 1 def set(self, key, value, expiry, info): self._start_key(key, 0) - self._out.write('%s %s' % (encode_key(key), encode_value(value))) + self._out.write(self.encode_key(key) + b' ' + self.encode_value(value)) def start_hash(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s ' % encode_key(key)) + self._out.write(self.encode_key(key) + b' ') def hset(self, key, field, value): self._write_comma() - self._out.write('%s %s' % (encode_key(field), encode_value(value))) + self._out.write(self.encode_key(field) + b' ' + self.encode_value(value)) def end_hash(self, key): self._end_key(key) def start_set(self, key, cardinality, expiry, info): self._start_key(key, cardinality) - self._out.write('%s ' % encode_key(key)) + self._out.write(self.encode_key(key) + b' ') def sadd(self, key, member): self._write_comma() - self._out.write('%s' % encode_value(member)) + self._out.write(self.encode_value(member)) def end_set(self, key): self._end_key(key) def start_list(self, key, expiry, info): self._start_key(key, 0) - self._out.write('%s ' % encode_key(key)) + self._out.write(self.encode_key(key) + b' ') def rpush(self, key, value) : 
self._elements_in_key += 1 self._write_comma() - self._out.write('%s' % encode_value(value)) + self._out.write(self.encode_value(value)) def end_list(self, key, info): self._end_key(key) def start_sorted_set(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s ' % encode_key(key)) + self._out.write(self.encode_key(key) + b' ') def zadd(self, key, score, member): self._write_comma() - self._out.write('%s %s' % (encode_key(member), encode_value(score))) + self._out.write(self.encode_key(member) + b' ' + self.encode_value(score)) def end_sorted_set(self, key): self._end_key(key) @@ -313,11 +231,16 @@ def end_sorted_set(self, key): class DiffCallback(RdbCallback): '''Prints the contents of RDB in a format that is unix sort friendly, so that two rdb files can be diffed easily''' - def __init__(self, out): + def __init__(self, out, string_escape=None): + if string_escape is None: + string_escape = encodehelpers.STRING_ESCAPE_PRINT + super(DiffCallback, self).__init__(string_escape) self._out = out self._index = 0 self._dbnum = 0 - + + def dbstr(self): + return b'db=' + encodehelpers.int2bytes(self._dbnum) + b' ' def start_rdb(self): pass @@ -331,14 +254,15 @@ def end_rdb(self): pass def set(self, key, value, expiry, info): - self._out.write('db=%d %s -> %s' % (self._dbnum, encode_key(key), encode_value(value))) + self._out.write(self.dbstr() + self.encode_key(key) + b' -> ' + self.encode_value(value)) self.newline() def start_hash(self, key, length, expiry, info): pass def hset(self, key, field, value): - self._out.write('db=%d %s . %s -> %s' % (self._dbnum, encode_key(key), encode_key(field), encode_value(value))) + self._out.write( + self.dbstr() + self.encode_key(key) + b' . 
' + self.encode_key(field) + b' -> ' + self.encode_value(value)) self.newline() def end_hash(self, key): @@ -348,7 +272,7 @@ def start_set(self, key, cardinality, expiry, info): pass def sadd(self, key, member): - self._out.write('db=%d %s { %s }' % (self._dbnum, encode_key(key), encode_value(member))) + self._out.write(self.dbstr() + self.encode_key(key) + b' { ' + self.encode_value(member) + b' }') self.newline() def end_set(self, key): @@ -358,7 +282,8 @@ def start_list(self, key, expiry, info): self._index = 0 def rpush(self, key, value) : - self._out.write('db=%d %s[%d] -> %s' % (self._dbnum, encode_key(key), self._index, encode_value(value))) + istr = encodehelpers.int2bytes(self._index) + self._out.write(self.dbstr() + self.encode_key(key) + b'[' + istr + b'] -> ' + self.encode_value(value)) self.newline() self._index = self._index + 1 @@ -369,14 +294,15 @@ def start_sorted_set(self, key, length, expiry, info): pass def zadd(self, key, score, member): - self._out.write('db=%d %s -> {%s, score=%s}' % (self._dbnum, encode_key(key), encode_key(member), encode_value(score))) + self._out.write(self.dbstr() + self.encode_key(key) + + b' -> {' + self.encode_key(member) + b', score=' + self.encode_value(score) + b'}') self.newline() def end_sorted_set(self, key): pass def newline(self): - self._out.write('\r\n') + self._out.write(b'\r\n') def _unix_timestamp(dt): @@ -384,7 +310,8 @@ def _unix_timestamp(dt): class ProtocolCallback(RdbCallback): - def __init__(self, out): + def __init__(self, out, string_escape=None): + super(ProtocolCallback, self).__init__(string_escape) self._out = out self.reset() @@ -411,11 +338,11 @@ def post_expiry(self, key): self.expireat(key, self.get_expiry_seconds(key)) def emit(self, *args): - self._out.write("*{}\r\n".format(len(args))) + self._out.write(codecs.encode("*{}\r\n".format(len(args)), 'ascii')) for arg in args: - val = str(arg) - self._out.write("${}\r\n".format(len(val))) - self._out.write(val + "\r\n") + val = 
encodehelpers.apply_escape_bytes(arg, self._escape) + self._out.write(codecs.encode("${}\r\n".format(len(val)), 'ascii')) + self._out.write(val + b"\r\n") def start_database(self, db_number): self.reset() @@ -425,7 +352,7 @@ def start_database(self, db_number): def set(self, key, value, expiry, info): self.pre_expiry(key, expiry) - self.emit('SET', key, value) + self.emit(b'SET', key, value) self.post_expiry(key) # Hash handling @@ -434,7 +361,7 @@ def start_hash(self, key, length, expiry, info): self.pre_expiry(key, expiry) def hset(self, key, field, value): - self.emit('HSET', key, field, value) + self.emit(b'HSET', key, field, value) def end_hash(self, key): self.post_expiry(key) @@ -445,7 +372,7 @@ def start_set(self, key, cardinality, expiry, info): self.pre_expiry(key, expiry) def sadd(self, key, member): - self.emit('SADD', key, member) + self.emit(b'SADD', key, member) def end_set(self, key): self.post_expiry(key) @@ -456,7 +383,7 @@ def start_list(self, key, expiry, info): self.pre_expiry(key, expiry) def rpush(self, key, value): - self.emit('RPUSH', key, value) + self.emit(b'RPUSH', key, value) def end_list(self, key, info): self.post_expiry(key) @@ -467,7 +394,7 @@ def start_sorted_set(self, key, length, expiry, info): self.pre_expiry(key, expiry) def zadd(self, key, score, member): - self.emit('ZADD', key, score, member) + self.emit(b'ZADD', key, score, member) def end_sorted_set(self, key): self.post_expiry(key) @@ -475,7 +402,7 @@ def end_sorted_set(self, key): # Other misc commands def select(self, db_number): - self.emit('SELECT', db_number) + self.emit(b'SELECT', db_number) def expireat(self, key, timestamp): - self.emit('EXPIREAT', key, timestamp) + self.emit(b'EXPIREAT', key, timestamp) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 32d7b3e..fddefa9 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -3,6 +3,7 @@ import sys from optparse import OptionParser from rdbtools import RdbParser, JSONCallback, DiffCallback, 
MemoryCallback, ProtocolCallback, PrintAllKeys, KeysOnlyCallback, KeyValsOnlyCallback +from rdbtools.encodehelpers import ESCAPE_CHOICES VALID_TYPES = ("hash", "set", "string", "list", "sortedset") def main(): @@ -28,7 +29,9 @@ def main(): help="Limit memory output to keys greater to or equal to this value (in bytes)") parser.add_option("-l", "--largest", dest="largest", default=None, help="Limit memory output to only the top N keys (by size)") - + parser.add_option("-e", "--escape", dest="escape", choices=ESCAPE_CHOICES, + help="Escape strings to encoding: {} (default), {}, {}, or {}.".format(*ESCAPE_CHOICES)) + (options, args) = parser.parse_args() if len(args) == 0: @@ -58,25 +61,32 @@ def main(): else: filters['types'].append(x) - if options.output: - f = open(options.output, "wb") - else: - f = sys.stdout - + out_file_obj = None try: - callback = { - 'diff': lambda f: DiffCallback(f), - 'json': lambda f: JSONCallback(f), - 'justkeys': lambda f: KeysOnlyCallback(f), - 'justkeyvals': lambda f: KeyValsOnlyCallback(f), - 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), 64), - 'protocol': lambda f: ProtocolCallback(f) - }[options.command](f) - except: - raise Exception('Invalid Command %s' % options.command) + if options.output: + out_file_obj = open(options.output, "wb") + else: + # Prefer not to depend on Python stdout implementation for writing binary. 
+ out_file_obj = os.fdopen(sys.stdout.fileno(), 'wb') + + try: + callback = { + 'diff': lambda f: DiffCallback(f, string_escape=options.escape), + 'json': lambda f: JSONCallback(f, string_escape=options.escape), + 'justkeys': lambda f: KeysOnlyCallback(f, string_escape=options.escape), + 'justkeyvals': lambda f: KeyValsOnlyCallback(f, string_escape=options.escape), + 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), + 64, string_escape=options.escape), + 'protocol': lambda f: ProtocolCallback(f, string_escape=options.escape) + }[options.command](out_file_obj) + except: + raise Exception('Invalid Command %s' % options.command) - parser = RdbParser(callback, filters=filters) - parser.parse(dump_file) + parser = RdbParser(callback, filters=filters) + parser.parse(dump_file) + finally: + if options.output and out_file_obj is not None: + out_file_obj.close() if __name__ == '__main__': main() diff --git a/rdbtools/cli/redis_memory_for_key.py b/rdbtools/cli/redis_memory_for_key.py index 0071ca4..24bc6b9 100755 --- a/rdbtools/cli/redis_memory_for_key.py +++ b/rdbtools/cli/redis_memory_for_key.py @@ -5,15 +5,14 @@ try: try: - from cStringIO import StringIO + from cStringIO import StringIO as BytesIO except ImportError: - from StringIO import StringIO + from StringIO import StringIO as BytesIO except ImportError: - from io import StringIO + from io import BytesIO from optparse import OptionParser from rdbtools import RdbParser, JSONCallback, MemoryCallback -from rdbtools.callbacks import encode_key from redis import StrictRedis from redis.exceptions import ConnectionError, ResponseError @@ -48,14 +47,15 @@ def print_memory_for_key(key, host='localhost', port=6379, db=0, password=None): reporter = PrintMemoryUsage() callback = MemoryCallback(reporter, 64) parser = RdbParser(callback, filters={}) - parser._key = key + # DUMP command only return the key data, so we hack RdbParser to inject key name as parsed bytes. 
+ parser._key = key.encode('utf-8') raw_dump = redis.execute_command('dump', key) if not raw_dump: sys.stderr.write('Key %s does not exist\n' % key) sys.exit(-1) - stream = StringIO(raw_dump) + stream = BytesIO(raw_dump) data_type = read_unsigned_char(stream) parser.read_object(stream, data_type) @@ -88,7 +88,7 @@ def read_unsigned_char(f) : class PrintMemoryUsage(object): def next_record(self, record) : - print("%s\t\t\t\t%s" % ("Key", encode_key(record.key))) + print("%s\t\t\t\t%s" % ("Key", record.key)) print("%s\t\t\t\t%s" % ("Bytes", record.bytes)) print("%s\t\t\t\t%s" % ("Type", record.type)) if record.type in ('set', 'list', 'sortedset', 'hash'): diff --git a/rdbtools/encodehelpers.py b/rdbtools/encodehelpers.py new file mode 100644 index 0000000..ab83a38 --- /dev/null +++ b/rdbtools/encodehelpers.py @@ -0,0 +1,154 @@ +from __future__ import print_function +import base64 +import codecs +import sys + +from .compat import isinteger + +STRING_ESCAPE_RAW = 'raw' +STRING_ESCAPE_PRINT = 'print' +STRING_ESCAPE_UTF8 = 'utf8' +STRING_ESCAPE_BASE64 = 'base64' +ESCAPE_CHOICES = [STRING_ESCAPE_RAW, STRING_ESCAPE_PRINT, STRING_ESCAPE_UTF8, STRING_ESCAPE_BASE64] + +if sys.version_info < (3,): + bval = ord + + def int2unistr(i): return codecs.decode(str(i), 'ascii') + int2bytes = str +else: + def bval(x): return x + + int2unistr = str + + def int2bytes(i): return codecs.encode(str(i), 'ascii') + +ASCII_ESCAPE_LOOKUP = [u'\\x00', u'\\x01', u'\\x02', u'\\x03', u'\\x04', u'\\x05', u'\\x06', u'\\x07', u'\\x08', + u'\\x09', u'\\x0A', u'\\x0B', u'\\x0C', u'\\x0D', u'\\x0E', u'\\x0F', u'\\x10', u'\\x11', + u'\\x12', u'\\x13', u'\\x14', u'\\x15', u'\\x16', u'\\x17', u'\\x18', u'\\x19', u'\\x1A', + u'\\x1B', u'\\x1C', u'\\x1D', u'\\x1E', u'\\x1F', u' ', u'!', u'"', u'#', u'$', u'%', u'&', u"'", + u'(', u')', u'*', u'+', u',', u'-', u'.', u'/', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', + u'8', u'9', u':', u';', u'<', u'=', u'>', u'?', u'@', u'A', u'B', u'C', u'D', u'E', u'F', 
u'G', + u'H', u'I', u'J', u'K', u'L', u'M', u'N', u'O', u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W', + u'X', u'Y', u'Z', u'[', u'\\', u']', u'^', u'_', u'`', u'a', u'b', u'c', u'd', u'e', u'f', u'g', + u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', + u'x', u'y', u'z', u'{', u'|', u'}', u'~', u'\\x7F', u'\\x80', u'\\x81', u'\\x82', u'\\x83', + u'\\x84', u'\\x85', u'\\x86', u'\\x87', u'\\x88', u'\\x89', u'\\x8A', u'\\x8B', u'\\x8C', + u'\\x8D', u'\\x8E', u'\\x8F', u'\\x90', u'\\x91', u'\\x92', u'\\x93', u'\\x94', u'\\x95', + u'\\x96', u'\\x97', u'\\x98', u'\\x99', u'\\x9A', u'\\x9B', u'\\x9C', u'\\x9D', u'\\x9E', + u'\\x9F', u'\\xA0', u'\\xA1', u'\\xA2', u'\\xA3', u'\\xA4', u'\\xA5', u'\\xA6', u'\\xA7', + u'\\xA8', u'\\xA9', u'\\xAA', u'\\xAB', u'\\xAC', u'\\xAD', u'\\xAE', u'\\xAF', u'\\xB0', + u'\\xB1', u'\\xB2', u'\\xB3', u'\\xB4', u'\\xB5', u'\\xB6', u'\\xB7', u'\\xB8', u'\\xB9', + u'\\xBA', u'\\xBB', u'\\xBC', u'\\xBD', u'\\xBE', u'\\xBF', u'\\xC0', u'\\xC1', u'\\xC2', + u'\\xC3', u'\\xC4', u'\\xC5', u'\\xC6', u'\\xC7', u'\\xC8', u'\\xC9', u'\\xCA', u'\\xCB', + u'\\xCC', u'\\xCD', u'\\xCE', u'\\xCF', u'\\xD0', u'\\xD1', u'\\xD2', u'\\xD3', u'\\xD4', + u'\\xD5', u'\\xD6', u'\\xD7', u'\\xD8', u'\\xD9', u'\\xDA', u'\\xDB', u'\\xDC', u'\\xDD', + u'\\xDE', u'\\xDF', u'\\xE0', u'\\xE1', u'\\xE2', u'\\xE3', u'\\xE4', u'\\xE5', u'\\xE6', + u'\\xE7', u'\\xE8', u'\\xE9', u'\\xEA', u'\\xEB', u'\\xEC', u'\\xED', u'\\xEE', u'\\xEF', + u'\\xF0', u'\\xF1', u'\\xF2', u'\\xF3', u'\\xF4', u'\\xF5', u'\\xF6', u'\\xF7', u'\\xF8', + u'\\xF9', u'\\xFA', u'\\xFB', u'\\xFC', u'\\xFD', u'\\xFE', u'\\xFF'] + +ASCII_ESCAPE_LOOKUP_BYTES = [b'\\x00', b'\\x01', b'\\x02', b'\\x03', b'\\x04', b'\\x05', b'\\x06', b'\\x07', b'\\x08', + b'\\x09', b'\\x0A', b'\\x0B', b'\\x0C', b'\\x0D', b'\\x0E', b'\\x0F', b'\\x10', b'\\x11', + b'\\x12', b'\\x13', b'\\x14', b'\\x15', b'\\x16', b'\\x17', b'\\x18', b'\\x19', b'\\x1A', + b'\\x1B', b'\\x1C', 
b'\\x1D', b'\\x1E', b'\\x1F', b' ', b'!', b'"', b'#', b'$', b'%', b'&', + b"'", b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b'0', b'1', b'2', b'3', b'4', b'5', + b'6', b'7', b'8', b'9', b':', b';', b'<', b'=', b'>', b'?', b'@', b'A', b'B', b'C', b'D', + b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', + b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_', b'`', b'a', b'b', + b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', + b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'{', b'|', b'}', b'~', b'\\x7F', + b'\\x80', b'\\x81', b'\\x82', b'\\x83', b'\\x84', b'\\x85', b'\\x86', b'\\x87', b'\\x88', + b'\\x89', b'\\x8A', b'\\x8B', b'\\x8C', b'\\x8D', b'\\x8E', b'\\x8F', b'\\x90', b'\\x91', + b'\\x92', b'\\x93', b'\\x94', b'\\x95', b'\\x96', b'\\x97', b'\\x98', b'\\x99', b'\\x9A', + b'\\x9B', b'\\x9C', b'\\x9D', b'\\x9E', b'\\x9F', b'\\xA0', b'\\xA1', b'\\xA2', b'\\xA3', + b'\\xA4', b'\\xA5', b'\\xA6', b'\\xA7', b'\\xA8', b'\\xA9', b'\\xAA', b'\\xAB', b'\\xAC', + b'\\xAD', b'\\xAE', b'\\xAF', b'\\xB0', b'\\xB1', b'\\xB2', b'\\xB3', b'\\xB4', b'\\xB5', + b'\\xB6', b'\\xB7', b'\\xB8', b'\\xB9', b'\\xBA', b'\\xBB', b'\\xBC', b'\\xBD', b'\\xBE', + b'\\xBF', b'\\xC0', b'\\xC1', b'\\xC2', b'\\xC3', b'\\xC4', b'\\xC5', b'\\xC6', b'\\xC7', + b'\\xC8', b'\\xC9', b'\\xCA', b'\\xCB', b'\\xCC', b'\\xCD', b'\\xCE', b'\\xCF', b'\\xD0', + b'\\xD1', b'\\xD2', b'\\xD3', b'\\xD4', b'\\xD5', b'\\xD6', b'\\xD7', b'\\xD8', b'\\xD9', + b'\\xDA', b'\\xDB', b'\\xDC', b'\\xDD', b'\\xDE', b'\\xDF', b'\\xE0', b'\\xE1', b'\\xE2', + b'\\xE3', b'\\xE4', b'\\xE5', b'\\xE6', b'\\xE7', b'\\xE8', b'\\xE9', b'\\xEA', b'\\xEB', + b'\\xEC', b'\\xED', b'\\xEE', b'\\xEF', b'\\xF0', b'\\xF1', b'\\xF2', b'\\xF3', b'\\xF4', + b'\\xF5', b'\\xF6', b'\\xF7', b'\\xF8', b'\\xF9', b'\\xFA', b'\\xFB', b'\\xFC', b'\\xFD', + b'\\xFE', b'\\xFF'] + + +def escape_ascii(bytes_data): + return 
u''.join(ASCII_ESCAPE_LOOKUP[bval(ch)] for ch in bytes_data) + + +def escape_ascii_bytes(bytes_data): + return b''.join(ASCII_ESCAPE_LOOKUP_BYTES[bval(ch)] for ch in bytes_data) + + +def escape_utf8_error(err): + return escape_ascii(err.object[err.start:err.end]), err.end + +codecs.register_error('rdbslashescape', escape_utf8_error) + + +def escape_utf8(byte_data): + return byte_data.decode('utf-8', 'rdbslashescape') + + +def bytes_to_unicode(byte_data, escape, skip_printable=False): + """ + Decode given bytes using specified escaping method. + :param byte_data: The byte-like object with bytes to decode. + :param escape: The escape method to use. + :param skip_printable: If True, don't escape byte_data with all 'printable ASCII' bytes. Defaults to False. + :return: New unicode string, escaped with the specified method if needed. + """ + if isinteger(byte_data): + if skip_printable: + return int2unistr(byte_data) + else: + byte_data = int2bytes(byte_data) + else: + assert (isinstance(byte_data, type(b''))) + if skip_printable and all(0x20 <= bval(ch) <= 0x7E for ch in byte_data): + escape = STRING_ESCAPE_RAW + + if escape == STRING_ESCAPE_RAW: + return byte_data.decode('latin-1') + elif escape == STRING_ESCAPE_PRINT: + return escape_ascii(byte_data) + elif escape == STRING_ESCAPE_UTF8: + return escape_utf8(byte_data) + elif escape == STRING_ESCAPE_BASE64: + return codecs.decode(base64.b64encode(byte_data), 'latin-1') + else: + raise UnicodeEncodeError("Unknown escape option") + + +def apply_escape_bytes(byte_data, escape, skip_printable=False): + """ + Apply the specified escape method on the given bytes. + :param byte_data: The byte-like object with bytes to escape. + :param escape: The escape method to use. + :param skip_printable: If True, don't escape byte_data with all 'printable ASCII' bytes. Defaults to False. + :return: new bytes object with the escaped bytes or byte_data itself on some no-op cases. 
+ """ + + if isinteger(byte_data): + if skip_printable: + return int2bytes(byte_data) + else: + byte_data = int2bytes(byte_data) + else: + assert (isinstance(byte_data, type(b''))) + if skip_printable and all(0x20 <= bval(ch) <= 0x7E for ch in byte_data): + escape = STRING_ESCAPE_RAW + + if escape == STRING_ESCAPE_RAW: + return byte_data + elif escape == STRING_ESCAPE_PRINT: + return escape_ascii_bytes(byte_data) + elif escape == STRING_ESCAPE_UTF8: + return codecs.encode(escape_utf8(byte_data), 'utf-8') + elif escape == STRING_ESCAPE_BASE64: + return base64.b64encode(byte_data) + else: + raise UnicodeEncodeError("Unknown escape option") diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 844c134..5da4820 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -1,3 +1,4 @@ +import codecs from collections import namedtuple import random import bisect @@ -8,7 +9,7 @@ import json from rdbtools.parser import RdbCallback -from rdbtools.callbacks import encode_key, encode_value +from rdbtools.encodehelpers import bytes_to_unicode from heapq import heappush, nlargest, heappop @@ -81,8 +82,9 @@ def __init__(self, out, bytes, largest): self._bytes = bytes self._largest = largest self._out = out - self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", - "size_in_bytes", "encoding", "num_elements", "len_largest_element")) + headers = "%s,%s,%s,%s,%s,%s,%s\n" % ( + "database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element") + self._out.write(codecs.encode(headers, 'latin-1')) if self._largest is not None: self._heap = [] @@ -92,8 +94,10 @@ def next_record(self, record) : return # some records are not keys (e.g. 
dict) if self._largest is None: if self._bytes is None or record.bytes >= int(self._bytes): - self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), - record.bytes, record.encoding, record.size, record.len_largest_element)) + rec_str = "%d,%s,%s,%d,%s,%d,%d\n" % ( + record.database, record.type, record.key, record.bytes, record.encoding, record.size, + record.len_largest_element) + self._out.write(codecs.encode(rec_str, 'latin-1')) else: heappush(self._heap, (record.bytes, record)) @@ -111,14 +115,15 @@ def __init__(self, out): self._out = out def next_record(self, record): - self._out.write("%s\n" % encode_key(record.key)) + self._out.write(codecs.encode("%s\n" % record.key, 'latin-1')) class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM The memory usage is approximate, and based on heuristics. ''' - def __init__(self, stream, architecture, redis_version='3.2'): + def __init__(self, stream, architecture, redis_version='3.2', string_escape=None): + super(MemoryCallback, self).__init__(string_escape) self._stream = stream self._dbnum = 0 self._current_size = 0 @@ -141,6 +146,12 @@ def __init__(self, stream, architecture, redis_version='3.2'): self._long_size = 4 self._architecture = 32 + def emit_record(self, record_type, key, byte_count, encoding, size, largest_el): + if key is not None: + key = bytes_to_unicode(key, self._escape, skip_printable=True) + record = MemoryRecord(self._dbnum, record_type, key, byte_count, encoding, size, largest_el) + self._stream.next_record(record) + def start_rdb(self): pass @@ -159,10 +170,8 @@ def start_database(self, db_number): self._db_expires = 0 def end_database(self, db_number): - record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_keys), None, None, None) - self._stream.next_record(record) - record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_expires), None, None, None) - 
self._stream.next_record(record) + self.emit_record("dict", None, self.hashtable_overhead(self._db_keys), None, None, None) + self.emit_record("dict", None, self.hashtable_overhead(self._db_expires), None, None, None) if hasattr(self._stream, 'end_database'): self._stream.end_database(db_number) @@ -178,8 +187,7 @@ def set(self, key, value, expiry, info): size += self.key_expiry_overhead(expiry) length = element_length(value) - record = MemoryRecord(self._dbnum, "string", key, size, self._current_encoding, length, length) - self._stream.next_record(record) + self.emit_record("string", key, size, self._current_encoding, length, length) self.end_key() def start_hash(self, key, length, expiry, info): @@ -211,8 +219,8 @@ def hset(self, key, field, value): self._current_size += 2*self.robj_overhead() def end_hash(self, key): - record = MemoryRecord(self._dbnum, "hash", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("hash", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_set(self, key, cardinality, expiry, info): @@ -229,8 +237,8 @@ def sadd(self, key, member): self._current_size += self.robj_overhead() def end_set(self, key): - record = MemoryRecord(self._dbnum, "set", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("set", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_list(self, key, expiry, info): @@ -292,8 +300,8 @@ def end_list(self, key, info): self._current_size += self.linkedlist_overhead() self._current_size += self.robj_overhead() * self._current_length self._current_size += self._list_items_size - record = MemoryRecord(self._dbnum, "list", key, self._current_size, self._current_encoding, 
self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("list", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_sorted_set(self, key, length, expiry, info): @@ -323,8 +331,8 @@ def zadd(self, key, score, member): self._current_size += self.skiplist_entry_overhead() def end_sorted_set(self, key): - record = MemoryRecord(self._dbnum, "sortedset", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("sortedset", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def end_key(self): diff --git a/rdbtools/parser.py b/rdbtools/parser.py index b027d7e..0c03297 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -3,16 +3,17 @@ import sys import datetime import re + +from rdbtools.encodehelpers import STRING_ESCAPE_RAW, apply_escape_bytes, bval from .compat import range, str2regexp -from io import BytesIO try: try: - from cStringIO import StringIO + from cStringIO import StringIO as BytesIO except ImportError: - from StringIO import StringIO + from StringIO import StringIO as BytesIO except ImportError: - from io import StringIO + from io import BytesIO try: import lzf @@ -59,6 +60,24 @@ class RdbCallback(object): This callback provides a serial and fast access to the dump file. """ + def __init__(self, string_escape): + if string_escape is None: + self._escape = STRING_ESCAPE_RAW + else: + self._escape = string_escape + + def encode_key(self, key): + """ + Escape a given key bytes with the instance chosen escape method. + + Key is not escaped if it contains only 'ASCII printable' bytes. 
+ """ + return apply_escape_bytes(key, self._escape, skip_printable=True) + + def encode_value(self, val): + """Escape a given value bytes with the instance chosen escape method.""" + return apply_escape_bytes(val, self._escape) + def start_rdb(self): """ Called once we know we are dealing with a valid redis dump file @@ -570,7 +589,7 @@ def read_list_from_quicklist(self, f): for i in range(0, count): raw_string = self.read_string(f) total_size += len(raw_string) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) diff --git a/tests/__init__.py b/tests/__init__.py index 5f86dec..fab64de 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,11 +1,18 @@ import unittest from tests.parser_tests import RedisParserTestCase from tests.memprofiler_tests import MemoryCallbackTestCase -from tests.callbacks_tests import ProtocolTestCase +from tests.callbacks_tests import ProtocolTestCase, JsonTestCase, DiffTestCase, KeysTestCase, KeyValsTestCase + def all_tests(): suite = unittest.TestSuite() - suite.addTest(unittest.makeSuite(RedisParserTestCase)) - suite.addTest(unittest.makeSuite(MemoryCallbackTestCase)) - suite.addTest(unittest.makeSuite(ProtocolTestCase)) + test_case_list = [RedisParserTestCase, + MemoryCallbackTestCase, + ProtocolTestCase, + JsonTestCase, + DiffTestCase, + KeysTestCase, + KeyValsTestCase] + for case in test_case_list: + suite.addTest(unittest.makeSuite(case)) return suite diff --git a/tests/callbacks_tests.py b/tests/callbacks_tests.py index 04db1be..bb26ba3 100644 --- a/tests/callbacks_tests.py +++ b/tests/callbacks_tests.py @@ -1,20 +1,121 @@ +import os import unittest import random +import sys from io import BytesIO -from rdbtools.callbacks import ProtocolCallback +from rdbtools import RdbParser +from rdbtools import encodehelpers +from rdbtools.callbacks import ProtocolCallback, JSONCallback, DiffCallback, KeysOnlyCallback, 
KeyValsOnlyCallback -class ProtocolTestCase(unittest.TestCase): +if sys.version_info < (3,): + def rand_bytes(count): + return ''.join(chr(random.randrange(256)) for _ in range(count)) +else: + def rand_bytes(count): + return bytes(random.randrange(256) for _ in range(count)) + +TEST_DUMPS_DIR = 'dumps' + + +class CallbackTester(unittest.TestCase): + """ + General callback tester to use with specific callback tests. + Child class should implement callback_setup() to fill _callback_class, and _fixture attributes. + """ def setUp(self): self._out = BytesIO() - self._callback = ProtocolCallback(self._out) - - def test_emit(self): - utf8_string = '\xd9\xa1\xdf\x82\xe0\xa5\xa9\xe1\xa7\x94\xf0\x91\x8b\xb5' - bcde_non_print = '\x00\x01bcde\r\n\x88\x99' - random_bytes = ''.join(chr(random.randrange(0, 255)) for _ in range(64)) - integer = 46 - self._callback.emit(utf8_string, bcde_non_print, random_bytes, integer) - expected = '\r\n'.join(['*4', '$14', utf8_string, '$10', bcde_non_print, '$64', random_bytes, '$2', '46\r\n']) + self.callback_setup() + + def callback_setup(self): + self._callback_class = None + self._fixture = {'escape_db_file': 'non_ascii_values.rdb'} + + def escape_test_helper(self, escape_name): + if self._callback_class is None: + return # Handle unittest discovery attempt to test with this "abstract" class. 
+ + escape = getattr(encodehelpers, escape_name) + callback = self._callback_class(out=self._out, string_escape=escape) + parser = RdbParser(callback) + parser.parse(os.path.join(os.path.dirname(__file__), TEST_DUMPS_DIR, self._fixture['escape_db_file'])) result = self._out.getvalue() - self.assertEquals(result, expected) + # print('\n%s escape method %s' % (self._callback_class.__name__, escape_name)) + # print("\t\tself._fixture['escape_out_%s'] = %s" % (escape, repr(result))) + # try: + # print(result.decode('utf8')) + # except UnicodeDecodeError: + # print(result.decode('latin-1')) + self.assertEqual(result, + self._fixture['escape_out_' + escape], + msg='\n%s escape method %s' % (self._callback_class.__name__, escape_name) + ) + + def test_raw_escape(self): + """Test using STRING_ESCAPE_RAW with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_RAW') + + def test_print_escape(self): + """Test using STRING_ESCAPE_PRINT with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_PRINT') + + def test_utf8_escape(self): + """Test using STRING_ESCAPE_UTF8 with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_UTF8') + + def test_base64_escape(self): + """Test using STRING_ESCAPE_BASE64 with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_BASE64') + + +class ProtocolTestCase(CallbackTester): + def callback_setup(self): + super(ProtocolTestCase, self).callback_setup() + self._callback_class = ProtocolCallback + self._fixture['escape_out_raw'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$10\r\n\x00! 
~0\n\t\rAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$14\r\n\x00$ ~0\x7f\xff\n\xaa\t\x80\rAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$27\r\n\xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_print'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$22\r\n\\x00! ~0\\x0A\\x09\\x0DAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$38\r\n\\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$99\r\n\\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA\r\n' + self._fixture['escape_out_utf8'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$10\r\n\x00! 
~0\n\t\rAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$23\r\n\x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$27\r\n\xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_base64'] = b'*2\r\n$8\r\nU0VMRUNU\r\n$4\r\nMA==\r\n*3\r\n$4\r\nU0VU\r\n$12\r\naW50X3ZhbHVl\r\n$4\r\nMTIz\r\n*3\r\n$4\r\nU0VU\r\n$8\r\nYXNjaWk=\r\n$16\r\nACEgfjAKCQ1BYg==\r\n*3\r\n$4\r\nU0VU\r\n$4\r\nYmlu\r\n$20\r\nACQgfjB//wqqCYANQWI=\r\n*3\r\n$4\r\nU0VU\r\n$12\r\ncHJpbnRhYmxl\r\n$12\r\nISsgQWJefg==\r\n*3\r\n$4\r\nU0VU\r\n$4\r\nMzc4\r\n$16\r\naW50X2tleV9uYW1l\r\n*3\r\n$4\r\nU0VU\r\n$8\r\ndXRmOA==\r\n$36\r\n15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq\r\n' + + +class JsonTestCase(CallbackTester): + def callback_setup(self): + super(JsonTestCase, self).callback_setup() + self._callback_class = JSONCallback + self._fixture['escape_out_raw'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\u0000! ~0\\n\\t\\rAb",\r\n"bin":"\\u0000$ ~0\\u007f\\u00ff\\n\\u00aa\\t\\u0080\\rAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\u00d7\\u0091\\u00d7\\u0093\\u00d7\\u0099\\u00d7\\u00a7\\u00d7\\u0094\\u00f0\\u0090\\u0080\\u008f123\\u00d7\\u00a2\\u00d7\\u0091\\u00d7\\u00a8\\u00d7\\u0099\\u00d7\\u00aa"}]' + self._fixture['escape_out_print'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\\\x00! ~0\\\\x0A\\\\x09\\\\x0DAb",\r\n"bin":"\\\\x00$ ~0\\\\x7F\\\\xFF\\\\x0A\\\\xAA\\\\x09\\\\x80\\\\x0DAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\\\xD7\\\\x91\\\\xD7\\\\x93\\\\xD7\\\\x99\\\\xD7\\\\xA7\\\\xD7\\\\x94\\\\xF0\\\\x90\\\\x80\\\\x8F123\\\\xD7\\\\xA2\\\\xD7\\\\x91\\\\xD7\\\\xA8\\\\xD7\\\\x99\\\\xD7\\\\xAA"}]' + self._fixture['escape_out_utf8'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\u0000! 
~0\\n\\t\\rAb",\r\n"bin":"\\u0000$ ~0\\u007f\\\\xFF\\n\\\\xAA\\t\\\\x80\\rAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\u05d1\\u05d3\\u05d9\\u05e7\\u05d4\\ud800\\udc0f123\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea"}]' + self._fixture['escape_out_base64'] = b'[{\r\n"int_value":"MTIz",\r\n"ascii":"ACEgfjAKCQ1BYg==",\r\n"bin":"ACQgfjB//wqqCYANQWI=",\r\n"printable":"ISsgQWJefg==",\r\n"378":"aW50X2tleV9uYW1l",\r\n"utf8":"15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq"}]' + + +class DiffTestCase(CallbackTester): + def callback_setup(self): + super(DiffTestCase, self).callback_setup() + self._callback_class = DiffCallback + self._fixture['escape_out_raw'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \x00! ~0\n\t\rAb\r\ndb=0 bin -> \x00$ ~0\x7f\xff\n\xaa\t\x80\rAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_print'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \\x00! ~0\\x0A\\x09\\x0DAb\r\ndb=0 bin -> \\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA\r\n' + self._fixture['escape_out_utf8'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \x00! 
~0\n\t\rAb\r\ndb=0 bin -> \x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_base64'] = b'db=0 int_value -> MTIz\r\ndb=0 ascii -> ACEgfjAKCQ1BYg==\r\ndb=0 bin -> ACQgfjB//wqqCYANQWI=\r\ndb=0 printable -> ISsgQWJefg==\r\ndb=0 378 -> aW50X2tleV9uYW1l\r\ndb=0 utf8 -> 15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq\r\n' + + +class KeysTestCase(CallbackTester): + def callback_setup(self): + super(KeysTestCase, self).callback_setup() + self._callback_class = KeysOnlyCallback + self._fixture['escape_out_raw'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_print'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_utf8'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_base64'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + + +class KeyValsTestCase(CallbackTester): + def callback_setup(self): + super(KeyValsTestCase, self).callback_setup() + self._callback_class = KeyValsOnlyCallback + self._fixture['escape_out_raw'] = b'\r\nint_value 123,\r\nascii \x00! ~0\n\t\rAb,\r\nbin \x00$ ~0\x7f\xff\n\xaa\t\x80\rAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa' + self._fixture['escape_out_print'] = b'\r\nint_value 123,\r\nascii \\x00! ~0\\x0A\\x09\\x0DAb,\r\nbin \\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA' + self._fixture['escape_out_utf8'] = b'\r\nint_value 123,\r\nascii \x00! 
~0\n\t\rAb,\r\nbin \x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa' + self._fixture['escape_out_base64'] = b'\r\nint_value MTIz,\r\nascii ACEgfjAKCQ1BYg==,\r\nbin ACQgfjB//wqqCYANQWI=,\r\nprintable ISsgQWJefg==,\r\n378 aW50X2tleV9uYW1l,\r\nutf8 15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq' + +if __name__ == '__main__': + unittest.main() diff --git a/tests/dumps/non_ascii_values.rdb b/tests/dumps/non_ascii_values.rdb new file mode 100644 index 0000000000000000000000000000000000000000..337564ff635fde2ff8fbbcd3967250c43fc83c53 GIT binary patch literal 202 zcmWG?b@2=~FfcIx#aWb^l3A=KF z;wUXnP0`Iw%{}y^lZW9S!*4bQ2F}d9lK8U3oYK?-)eNkO#mSkOTnvf|bp~9VypBl> z%t@Jfd<-f;e*J&0Rh$hVG0uV_pr*v6oK$v2Z3V}qxH^V|Rg63!gR)aA Date: Sun, 19 Feb 2017 12:05:01 +0200 Subject: [PATCH 37/41] Update README.md for strings escape, new examples, and some formatting cahnges. --- README.md | 146 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 104 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 00e1cde..de23f7b 100644 --- a/README.md +++ b/README.md @@ -26,29 +26,83 @@ To install from source : cd redis-rdb-tools sudo python setup.py install -## Converting dump files to JSON ## +# Command line usage examples + +Every run of RDB Tool requires to specify a command to indicate what should be done with the parsed RDB data. +Valid commands are: json, diff, justkeys, justkeyvals and protocol. 
+ +JSON from a two database dump: + + > rdb --command json /var/redis/6379/dump.rdb -Parse the dump file and print the JSON on standard output + [{ + "user003":{"fname":"Ron","sname":"Bumquist"}, + "lizards":["Bush anole","Jackson's chameleon","Komodo dragon","Ground agama","Bearded dragon"], + "user001":{"fname":"Raoul","sname":"Duke"}, + "user002":{"fname":"Gonzo","sname":"Dr"}, + "user_list":["user003","user002","user001"]},{ + "baloon":{"helium":"birthdays","medical":"angioplasty","weather":"meteorology"}, + "armadillo":["chacoan naked-tailed","giant","Andean hairy","nine-banded","pink fairy"], + "aroma":{"pungent":"vinegar","putrid":"rotten eggs","floral":"roses"}}] - rdb --command json /var/redis/6379/dump.rdb +## Filter parsed output + +Only process keys that match the regex, and only print key and values: + + > rdb --command justkeyvals --key "user.*" /var/redis/6379/dump.rdb + + user003 fname Ron,sname Bumquist, + user001 fname Raoul,sname Duke, + user002 fname Gonzo,sname Dr, + user_list user003,user002,user001 -Only process keys that match the regex +Only process hashes starting with "a", in database 2: - rdb --command json --key "user.*" /var/redis/6379/dump.rdb + > rdb -c json --db 2 --type hash --key "a.*" /var/redis/6379/dump.rdb -Only process hashes starting with "a", in database 2 + [{},{ + "aroma":{"pungent":"vinegar","putrid":"rotten eggs","floral":"roses"}}] + +## Converting dump files to JSON ## + +The `json` command output is UTF-8 encoded JSON. +By default, the callback try to parse RDB data using UTF-8 and escape non 'ASCII printable' characters with the `\U` notation, or non UTF-8 parsable bytes with `\x`. +Attempting to decode RDB data can lead to binary data curroption, this can be avoided by using the `--escape raw` option. +Another option, is to use `-e base64` for Base64 encoding of binary data. 
+ + +Parse the dump file and print the JSON on standard output: + + > rdb -c json /var/redis/6379/dump.rdb + + [{ + "Citat":["B\u00e4ttre sent \u00e4n aldrig","Bra karl reder sig sj\u00e4lv","Man ska inte k\u00f6pa grisen i s\u00e4cken"], + "bin_data":"\\xFE\u0000\u00e2\\xF2"}] + +Parse the dump file to raw bytes and print the JSON on standard output: - rdb --command json --db 2 --type hash --key "a.*" /var/redis/6379/dump.rdb + > rdb -c json /var/redis/6379/dump.rdb --escape raw + [{ + "Citat":["B\u00c3\u00a4ttre sent \u00c3\u00a4n aldrig","Bra karl reder sig sj\u00c3\u00a4lv","Man ska inte k\u00c3\u00b6pa grisen i s\u00c3\u00a4cken"], + "bin_data":"\u00fe\u0000\u00c3\u00a2\u00f2"}] ## Generate Memory Report ## -Running with the `-c memory` generates a CSV report with the approximate memory used by that key. +Running with the `-c memory` generates a CSV report with the approximate memory used by that key. `--bytes C` and `'--largest N` can be used to limit output to keys larger than C bytes, or the N largest keys. - rdb -c memory /var/redis/6379/dump.rdb > memory.csv + > rdb -c memory /var/redis/6379/dump.rdb --bytes 128 -f memory.csv + > cat memory.csv + database,type,key,size_in_bytes,encoding,num_elements,len_largest_element + 0,list,lizards,241,quicklist,5,19 + 0,list,user_list,190,quicklist,3,7 + 2,hash,baloon,138,ziplist,3,11 + 2,list,armadillo,231,quicklist,5,20 + 2,hash,aroma,129,ziplist,3,11 -The generated CSV has the following columns - Database Number, Data Type, Key, Memory Used in bytes and Encoding. + +The generated CSV has the following columns - Database Number, Data Type, Key, Memory Used in bytes and RDB Encoding type. Memory usage includes the key, the value and any other overheads. Note that the memory usage is approximate. In general, the actual memory used will be slightly higher than what is reported. 
@@ -61,17 +115,13 @@ The memory report should help you detect memory leaks caused by your application Sometimes you just want to find the memory used by a particular key, and running the entire memory report on the dump file is time consuming. -In such cases, you can use the `redis-memory-for-key` command - -Example : +In such cases, you can use the `redis-memory-for-key` command: - redis-memory-for-key person:1 + > redis-memory-for-key person:1 - redis-memory-for-key -s localhost -p 6379 -a mypassword person:1 - -Output : + > redis-memory-for-key -s localhost -p 6379 -a mypassword person:1 - Key "person:1" + Key person:1 Bytes 111 Type hash Encoding ziplist @@ -87,20 +137,20 @@ NOTE : First, use the --command diff option, and pipe the output to standard sort utility - rdb --command diff /var/redis/6379/dump1.rdb | sort > dump1.txt - rdb --command diff /var/redis/6379/dump2.rdb | sort > dump2.txt + > rdb --command diff /var/redis/6379/dump1.rdb | sort > dump1.txt + > rdb --command diff /var/redis/6379/dump2.rdb | sort > dump2.txt Then, run your favourite diff program - kdiff3 dump1.txt dump2.txt + > kdiff3 dump1.txt dump2.txt -To limit the size of the files, you can filter on keys using the --key=regex option +To limit the size of the files, you can filter on keys using the `--key` option ## Emitting Redis Protocol ## -You can convert RDB file into a stream of [redis protocol](http://redis.io/topics/protocol) using the "protocol" command. +You can convert RDB file into a stream of [redis protocol](http://redis.io/topics/protocol) using the `protocol` command. - rdb --command protocol /var/redis/6379/dump.rdb + > rdb --c protocol /var/redis/6379/dump.rdb *4 $4 @@ -112,37 +162,49 @@ You can convert RDB file into a stream of [redis protocol](http://redis.io/topic $8 Sripathi -You can pipe the output to netcat and re-import a subset of the data. 
-For example, if you want to shard your data into two redis instances, you can use the --key flag to select a subset of data, +You can pipe the output to netcat and re-import a subset of the data. +For example, if you want to shard your data into two redis instances, you can use the --key flag to select a subset of data, and then pipe the output to a running redis instance to load that data. - Read [Redis Mass Insert](http://redis.io/topics/mass-insert) for more information on this. -## Using the Parser ## +When printing protocol output, the `--escape` option can be used with `printable` or `utf8` to avoid non printable/control characters. + +# Using the Parser ## - import sys from rdbtools import RdbParser, RdbCallback + from rdbtools.encodehelpers import bytes_to_unicode - class MyCallback(RdbCallback) : - ''' Simple example to show how callback works. + class MyCallback(RdbCallback): + ''' Simple example to show how callback works. See RdbCallback for all available callback methods. See JsonCallback for a concrete example - ''' - def set(self, key, value, expiry): - print('%s = %s' % (str(key), str(value))) - + ''' + + def __init__(self): + super(MyCallback, self).__init__(string_escape=None) + + def encode_key(self, key): + return bytes_to_unicode(key, self._escape, skip_printable=True) + + def encode_value(self, val): + return bytes_to_unicode(val, self._escape) + + def set(self, key, value, expiry, info): + print('%s = %s' % (self.encode_key(key), self.encode_value(value))) + def hset(self, key, field, value): - print('%s.%s = %s' % (str(key), str(field), str(value))) - + print('%s.%s = %s' % (self.encode_key(key), self.encode_key(field), self.encode_value(value))) + def sadd(self, key, member): - print('%s has {%s}' % (str(key), str(member))) - - def rpush(self, key, value) : - print('%s has [%s]' % (str(key), str(value))) - + print('%s has {%s}' % (self.encode_key(key), self.encode_value(member))) + + def rpush(self, key, value): + print('%s has [%s]' % 
(self.encode_key(key), self.encode_value(value))) + def zadd(self, key, score, member): print('%s has {%s : %s}' % (str(key), str(member), str(score))) + callback = MyCallback() parser = RdbParser(callback) parser.parse('/var/redis/6379/dump.rdb') From 51089e11a441c092ea35a5222b873b5042ebe936 Mon Sep 17 00:00:00 2001 From: Guy Benoish Date: Mon, 20 Feb 2017 11:40:01 +0200 Subject: [PATCH 38/41] RDB v8 support --- rdbtools/parser.py | 51 ++++++++++++------ ...b_version_8_with_64b_length_and_scores.rdb | Bin 0 -> 32305 bytes tests/parser_tests.py | 7 +++ 3 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 0c03297..80af17b 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -23,7 +23,8 @@ REDIS_RDB_6BITLEN = 0 REDIS_RDB_14BITLEN = 1 -REDIS_RDB_32BITLEN = 2 +REDIS_RDB_32BITLEN = 0x80 +REDIS_RDB_64BITLEN = 0x81 REDIS_RDB_ENCVAL = 3 REDIS_RDB_OPCODE_AUX = 250 @@ -38,6 +39,8 @@ REDIS_RDB_TYPE_SET = 2 REDIS_RDB_TYPE_ZSET = 3 REDIS_RDB_TYPE_HASH = 4 +REDIS_RDB_TYPE_ZSET_2 = 5 # ZSET version 2 with doubles stored in binary. 
+REDIS_RDB_TYPE_MODULE = 6 REDIS_RDB_TYPE_HASH_ZIPMAP = 9 REDIS_RDB_TYPE_LIST_ZIPLIST = 10 REDIS_RDB_TYPE_SET_INTSET = 11 @@ -51,7 +54,7 @@ REDIS_RDB_ENC_LZF = 3 DATA_TYPE_MAPPING = { - 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", + 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", 5 : "sortedset2", 6 : "module", 9 : "hash", 10 : "list", 11 : "set", 12 : "sortedset", 13 : "hash", 14 : "list"} class RdbCallback(object): @@ -393,8 +396,12 @@ def read_length_with_encoding(self, f) : elif enc_type == REDIS_RDB_14BITLEN : bytes.append(read_unsigned_char(f)) length = ((bytes[0]&0x3F)<<8)|bytes[1] - else : + elif bytes[0] == REDIS_RDB_32BITLEN: length = ntohl(f) + elif bytes[0] == REDIS_RDB_64BITLEN: + length = ntohu64(f) + else: + raise Exception('read_length_with_encoding', "Invalid string encoding %s (bytes[0] 0x%X)" % (enc_type, bytes[0])) return (length, is_encoded) def read_length(self, f) : @@ -463,12 +470,15 @@ def read_object(self, f, enc_type) : val = self.read_string(f) self._callback.sadd(self._key, val) self._callback.end_set(self._key) - elif enc_type == REDIS_RDB_TYPE_ZSET : + elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2: length = self.read_length(f) self._callback.start_sorted_set(self._key, length, self._expiry, info={'encoding':'skiplist'}) for count in range(0, length) : val = self.read_string(f) - score = self.read_float(f) + if enc_type == REDIS_RDB_TYPE_ZSET_2 : + score = read_binary_double(f) + else: + score = self.read_float(f) self._callback.zadd(self._key, score, val) self._callback.end_sorted_set(self._key) elif enc_type == REDIS_RDB_TYPE_HASH : @@ -491,6 +501,8 @@ def read_object(self, f, enc_type) : self.read_hash_from_ziplist(f) elif enc_type == REDIS_RDB_TYPE_LIST_QUICKLIST: self.read_list_from_quicklist(f) + elif enc_type == REDIS_RDB_TYPE_MODULE : + raise Exception('read_object', 'Unable to read Redis Modules RDB objects (key %s)' % (enc_type, self._key)) else : raise 
Exception('read_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) @@ -527,7 +539,7 @@ def skip_object(self, f, enc_type): skip_strings = self.read_length(f) elif enc_type == REDIS_RDB_TYPE_SET : skip_strings = self.read_length(f) - elif enc_type == REDIS_RDB_TYPE_ZSET : + elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2 : skip_strings = self.read_length(f) * 2 elif enc_type == REDIS_RDB_TYPE_HASH : skip_strings = self.read_length(f) * 2 @@ -712,7 +724,7 @@ def verify_magic_string(self, magic_string) : def verify_version(self, version_str) : version = int(version_str) - if version < 1 or version > 7: + if version < 1 or version > 8: raise Exception('verify_version', 'Invalid RDB version number %d' % version) self._rdb_version = version @@ -741,7 +753,7 @@ def init_filter(self, filters): self._filters['not_keys'] = str2regexp(filters['not_keys']) if not 'types' in filters: - self._filters['types'] = ('set', 'hash', 'sortedset', 'string', 'list') + self._filters['types'] = ('set', 'hash', 'sortedset', 'sortedset2', 'module', 'string', 'list') elif isinstance(filters['types'], bytes): self._filters['types'] = (filters['types'], ) elif isinstance(filters['types'], list): @@ -813,14 +825,20 @@ def skip(f, free): if free : f.read(free) +def memrev(arr): + l = len(arr) + new_arr = bytearray(l) + for i in range(l): + new_arr[-i-1] = arr[i] + return str(new_arr) + def ntohl(f) : - val = read_unsigned_int(f) - new_val = 0 - new_val = new_val | ((val & 0x000000ff) << 24) - new_val = new_val | ((val & 0xff000000) >> 24) - new_val = new_val | ((val & 0x0000ff00) << 8) - new_val = new_val | ((val & 0x00ff0000) >> 8) - return new_val + val = memrev(f.read(4)) + return struct.unpack('I', val)[0] + +def ntohu64(f) : + val = memrev(f.read(8)) + return struct.unpack('Q', val)[0] def to_datetime(usecs_since_epoch): seconds_since_epoch = usecs_since_epoch // 1000000 @@ -862,6 +880,9 @@ def read_signed_long(f) : def read_unsigned_long(f) : 
return struct.unpack('Q', f.read(8))[0] + +def read_binary_double(f) : + return struct.unpack('d', f.read(8))[0] def string_as_hexcode(string) : for s in string : diff --git a/tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb b/tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb new file mode 100644 index 0000000000000000000000000000000000000000..ca7cbe1f13ae1fb302fbc2597580685150037197 GIT binary patch literal 32305 zcmai-KhI!Cb;egQwgU-DnXf>zX#U=rTg62vXrZCq^}dT%v17D$EKx>jETzwfNRuXY zWKmJkQ`6)dAQB1q&BD8ufam9}cHh~VJ9qAx^XECwIqwg?_mAKIVJT(*-@iWj!=G=y zet7Zf)2E+5yng)2`~AE3cWY^H|I4@j@L!(4`s~v$KY07ocR&2>)yEG%`|Q8|^G9!g z_SgUN>BEbsA3uEjv-0k@pS=C?-#+`~<}PYmUHNa`tnkUpm!Cd-_T`h`J}&g+ z{~!PMSC4;r^5pSv@4Wovled3={_OR;kNpf;EdHfpJ95Oz~xb7>@ z?{_U-_E-1XOXj!hu2uHGu6i{&zg_OuXWrlLqb->~)<&H2{8cyr+y5R{mG__PYL}ev z`F@w|@4j2J@v;|x^8RbTTebby-J6}?n7zyX`Vl{QKZK{PmiO1WKbJhetueOjf9!r0 zd;dDiE&DI6*DmX~vH1jf|JRY=9crBe7{-q{JF}q^KW-udVakY+g~fFA31*x z|F-k%yVoh_fA2bqtiM|sOYZ+bfG;`U(K@e?&#!g_SibL^tD1bFpV>lQzs<=B=W#CA zzGb|XbrpI4Yt~Wa{IEZ5$opUSY(@5eU0uKby|55Azg!$ae*eY&s&fC@)%5<=iQggj z1M!_*-oMPbhse*qyXsf0-){E)^Il87?-+C1^XEi^i+uKme9H50)bAzlk9O)+?&moA z!^V5RwtPR-J0h6B|5fT`_OtdVH( zO}SsVPuBtE|cg&hxLj`V{+6Fo3t+-+gV< zzc<&z?DOnpn15bypUwW@oU8wPVg5}1R-M!I@2J(kFXqS2Z*se*>X$;unZM87uW9~0 z>bXtP--!3E<7;A~^Yi<)1rH^UUwc&Z?-9;n^2y70+yBJCcR3%TmudQG?YT|=&9d4M`K){M0rUGK zd~W|oy{epFCH^U*|3EhYF!J;J=$FN}xF@&WuR?fW{te*KW#1qDcN>1N)~F`GwYJ*i ze9)ij-*+}(@f-NZaQ}x3F6UqGOYJlAds%!~u#cwy`@+78{#2g?yU2Xd0X)1c zKU~gk@xk#<%zr5r^(@~H`+Vo4j8+@P*uudsRY^{CsW?0YV1SO^RI3Iz#``ZdRXoI zT*twb`F!4H@fF}tGyDzf(QeW2yL3X}d_HGXzdr`bzJCMV)cB^XV&@0Cgo~W_M|ju` zKf-@j`NO{~zWxc&^7AkJOPkNf!$HaOkM=eDxH&(QUv4iDWL_WpT`a$V|49gy=eJJw zMC5Zdmbd%eJ0Cc&U(ilw9~-+f`%zls33+|c^J)IqxqV&qC(dIv{Eh7a;N|CY{s3Tk z9Pg^D{7(X0nLqA5O}`R77SkUnzh&_k`n#I^4jf#=e?b?9`DZ76VE!xm3lo*k??$@p z{-OOezealCGWkLJ_5R_0&L;AQ@~q~+;{OsL=Jx}9!a>U84SdJGKj;kxD9;~-mdzygu+Q2xA^^ z7t?J%=$QFm7wXC62k;bxK0hD*z19DzM3Ca2kv^Ta{yo<=d};tdoBRQ9@P>K+cpq!X 
zIL@Wwzx^h>m+wP#*DSv6B-qWrIO)eN^49?X?0%pgZ_5wjd?>#H=WSVhJnOCbE6~x~ z?mzh{jo)!U5s>`-S_h$)`-^-6z~=dQ_v(LReu|wR$t4aruYaEaSLA<;(T(2#d;&xy z>jRIP{9dpl?EaIy(fF41eX;L{do&DR>o*xP+5b2Jw%AwDVNwA0{zN$PeJ26F`ERf@ z0D$xQOb4LM<2U%9;nNsrQq*~V1OB)C8uSN3ygVQ407XQ`NxzwYit@h=pE&#*i+{;a zkwMJQ9|xY6{ox*|e+qbrhRN$=U$@y8{6DwPYm(r~>)#XpoQ$L2QGt0r{;~59$SzoX z0(gl7me(H`Xwz>eyKM0d@Du<`ULW#FHT{1Ggrent{)D_n1~tzIT_;7E$4~x6^-F7f;XJ?L-k5$Y*MLEl=Tlvx{wdCl;`h)4Z}TsR zu5p3$^G|x){LkyeXQJOQuOy)J`WFP5F8&YrHXdZ24?Ug)ejXpti_Jfnpu1-O@$WUi zv>Nnoc|PzM07>2--VY@{c^vx>0FlSXsSl9(#J|d4CcHv{=k-b6@gHRTz&CP#4nAl0 z7wtX_KPVRhDDMY&41|{Vhw*d#gYen(`@!$b{&V}BpLqx_q8~Qd0h4dY{ivw?ez0Gr z*$?cO#xGnh5Zb(cVSk3dFT4Zu-@%6!|E2t6+x_3LH?2OVA9{(&2MW;m1333EWb*ry zeA27g%Q z9|Im@z-52?WRUVW%3bj(Hor|jA$%k_^7;fH+`v2zxDG=qk3&Dz`nyA~HvAQGc`<$j z%j2Tt=O1>W>7NQa(CQZ`-&FgAcc%FGVK15g2{=ZEH}C&m)Gx{X#y{YK%lt!cmid@B z5_EY!_@d@73j`mNuM_{9enmSF!sO@A!%r9aI`mib-++gk;y0&2R-Rvvdnfwo=ns2; zz&ow40R1F`o!_7GYEB@JuXz%nc^vJe{BX+Oi}@dbBM`*$`gjj8T=O`|Tdm)QKkoIr z5YSEk-Ne6!4<2@v;!{*7*#3!+wf?G}`UN>3?k_30d_ITWXZjW6O$A(@4>=PF%;RW3 zGK6{j`1f{xBoEcTo$7qU|FI65Un9G${s;C=^Z(?p_{91B2#>V>;=oVlpL02(kmU6d zUrdmUQ{G|vr;#AC_=o%u44%9``4x>%DK7%z%=1a^X#E}0rQ)ABcUb7Wem?=W**5|t z^RKbKIAD2w+6yQ@6Z#4fK%Re*y|DZ+*;U0?DIZq#W7T*CMUB-Vx`4Zsd_apnR{s+kyQc!t5?3UZ)lg*=|@_hJVYJVZuEW5wJ z>ts-5eehfJUt#wE;n;lWxuU;GzAVMh(VqHz{A2H!ukCvF2F$N@l&5{{Qgt_WBn5f?wiGLIL~Cb^Yf{G!Nbbq z6qn#(%Xpa>h&+z7f07G8`1$=vertXc<3fV)Pv?K@<*R>w_R-5%4;^Q@q36J z@;EmWY&VaS+SdIMnh^cx`RFaBW|H|+==LPe?P^0@ra4o7{vmD|^@p2BY@O#5IFo$L z<0L-mewN26?7hr2!Z1NT%<~WR*sdC-NnF%H!M~rLF+n!Jf_Y$?OwdR$BV>rgNs;FQ|d!*7Et%8bV9wC&0P6 zu2j}&Y6E17-e>vw1lIWec^u}ihCXDb=v|uUqn%WKxqfOtkRH6Rygt_pZbTkmaHB2t zg8Jlw;jJE_0c}+8q>N_OUI~e<98$E^?}AUG{v|Rz32I)?%^iq z@dLbBY7AnuwjRK5*V0KsSFqFkd@3{lf@}Yke@$XYOXF$Wq^)tFhAMT0#$}zkT3Im`o7W|C6Axh9~OGk*-r1rX^dm~17f67J8@1lw1oH# z_#ykJ@r$K?2tBoRGmWh@^apy;)~57r_4|Q${57c~L^Qu2rIktzz`4-YNW^zq8i@MD zLC?=8IDlN7$AN}4b$AMM4O#}6*3xH6Lm^k?=W~CQdIR(f_L%3BIlPTpb{d!1{=tQt 
zmOjCKx6}-IUy|c_e-wUbYgXzWsBO&i`8XVbi~|su{R8>c(m13?Thq~*4{#_yA7qs7 zsCk^e8SX#c-g=0>@>+V1d4SxOpU?5*q2zIRzs2HnO2ah1r8T&=uEjebz|YTT|Jqsu z+;&-fM)0Mr^-ylwIvnZ2fy?`&bW^Dp@CLXcc|Oi78YGXCH`Mr_yuFvs!ECnpgz!{b z^Kibflk)zEjwqhZ<1puxS_JT~trt$_)9^){7sdCu-#|!te?ZrSka?WU0J-Tre&9W$ z#^Qa#t;q9FV$yrB(PJPVI-gWY4FtSAVq})B)8GWX3Vz`FzNX zYkYz7)?a;taBFLE3d6PaAgwj%O_BFU=^g@>$B|!>vw0liqU$dVkom8tHHpRdL~m-} zPhpkeW8i~YTE2;XEIycn0LkkEPHStSec%J}U+C+nt!HT*rqmve9|tk-58e_Yn8z`$+ByU9SzDJhh~<{Q z!~B!s&(9~iRQ&+Hsjat&egJs#`UtO5)6g!k@A7=m2SSKEeoBKaz5$#d#gyk0etu19 z5dnbc50r!cY9JcRIeta@!tOW8M>1%#f2hZX@1k9_wFA6LPFU8*{WSlPz9^8lALsw> P-~Rm@?|m`; Date: Tue, 21 Feb 2017 09:27:01 +0200 Subject: [PATCH 39/41] fixed sorted set skip mechanism --- rdbtools/memprofiler.py | 37 +++++++++++++++---------------------- rdbtools/parser.py | 27 ++++++++++++++++++--------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index 5da4820..0059587 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -182,9 +182,7 @@ def end_rdb(self): def set(self, key, value, expiry, info): self._current_encoding = info['encoding'] - size = self.sizeof_string(key) + self.sizeof_string(value) + self.top_level_object_overhead() - size += 2*self.robj_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) + self.sizeof_string(value) length = element_length(value) self.emit_record("string", key, size, self._current_encoding, length, length) @@ -193,10 +191,7 @@ def set(self, key, value, expiry, info): def start_hash(self, key, length, expiry, info): self._current_encoding = info['encoding'] self._current_length = length - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) if 'sizeof_value' in info: size += info['sizeof_value'] @@ -216,7 +211,8 @@ def hset(self, 
key, field, value): self._current_size += self.sizeof_string(field) self._current_size += self.sizeof_string(value) self._current_size += self.hashtable_entry_overhead() - self._current_size += 2*self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += 2*self.robj_overhead() def end_hash(self, key): self.emit_record("hash", key, self._current_size, self._current_encoding, self._current_length, @@ -234,7 +230,8 @@ def sadd(self, key, member): if self._current_encoding == 'hashtable': self._current_size += self.sizeof_string(member) self._current_size += self.hashtable_entry_overhead() - self._current_size += self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() def end_set(self, key): self.emit_record("set", key, self._current_size, self._current_encoding, self._current_length, @@ -246,10 +243,7 @@ def start_list(self, key, expiry, info): self._list_items_size = 0 self._list_items_zipped_size = 0 self._current_encoding = info['encoding'] - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) # ignore the encoding in the rdb, and predict the encoding that will be used at the target redis version if self._redis_version >= StrictVersion('3.2'): @@ -298,7 +292,8 @@ def end_list(self, key, info): else: # linkedlist self._current_size += self.linkedlist_entry_overhead() * self._current_length self._current_size += self.linkedlist_overhead() - self._current_size += self.robj_overhead() * self._current_length + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() * self._current_length self._current_size += self._list_items_size self.emit_record("list", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) @@ -307,10 +302,7 @@ def end_list(self, 
key, info): def start_sorted_set(self, key, length, expiry, info): self._current_length = length self._current_encoding = info['encoding'] - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) if 'sizeof_value' in info: size += info['sizeof_value'] @@ -325,9 +317,10 @@ def zadd(self, key, score, member): self._len_largest_element = element_length(member) if self._current_encoding == 'skiplist': - self._current_size += 8 # self.sizeof_string(score) + self._current_size += 8 # score (double) self._current_size += self.sizeof_string(member) - self._current_size += 2*self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() self._current_size += self.skiplist_entry_overhead() def end_sorted_set(self, key): @@ -364,10 +357,10 @@ def sizeof_string(self, string): return self.malloc_overhead(l + 1 + 8 + 1) return self.malloc_overhead(l + 1 + 16 + 1) - def top_level_object_overhead(self): + def top_level_object_overhead(self, key, expiry): # Each top level object is an entry in a dictionary, and so we have to include # the overhead of a dictionary entry - return self.hashtable_entry_overhead() + return self.hashtable_entry_overhead() + self.sizeof_string(key) + self.robj_overhead() + self.key_expiry_overhead(expiry) def key_expiry_overhead(self, expiry): # If there is no expiry, there isn't any overhead diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 80af17b..4745ff5 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -54,7 +54,7 @@ REDIS_RDB_ENC_LZF = 3 DATA_TYPE_MAPPING = { - 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", 5 : "sortedset2", 6 : "module", + 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", 5 : "sortedset", 6 : "module", 9 : "hash", 10 : "list", 11 : "set", 12 : "sortedset", 13 : "hash", 14 : "list"} 
class RdbCallback(object): @@ -401,7 +401,7 @@ def read_length_with_encoding(self, f) : elif bytes[0] == REDIS_RDB_64BITLEN: length = ntohu64(f) else: - raise Exception('read_length_with_encoding', "Invalid string encoding %s (bytes[0] 0x%X)" % (enc_type, bytes[0])) + raise Exception('read_length_with_encoding', "Invalid string encoding %s (encoding byte 0x%X)" % (enc_type, bytes[0])) return (length, is_encoded) def read_length(self, f) : @@ -470,15 +470,12 @@ def read_object(self, f, enc_type) : val = self.read_string(f) self._callback.sadd(self._key, val) self._callback.end_set(self._key) - elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2: + elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2 : length = self.read_length(f) self._callback.start_sorted_set(self._key, length, self._expiry, info={'encoding':'skiplist'}) for count in range(0, length) : val = self.read_string(f) - if enc_type == REDIS_RDB_TYPE_ZSET_2 : - score = read_binary_double(f) - else: - score = self.read_float(f) + score = read_binary_double(f) if enc_type == REDIS_RDB_TYPE_ZSET_2 else self.read_float(f) self._callback.zadd(self._key, score, val) self._callback.end_sorted_set(self._key) elif enc_type == REDIS_RDB_TYPE_HASH : @@ -530,6 +527,13 @@ def skip_string(self, f): bytes_to_skip = length skip(f, bytes_to_skip) + + def skip_float(self, f): + dbl_length = read_unsigned_char(f) + skip(f, dbl_length if dbl_length < 253 else 1) + + def skip_binary_double(self, f): + skip(f, 8) def skip_object(self, f, enc_type): skip_strings = 0 @@ -540,7 +544,10 @@ def skip_object(self, f, enc_type): elif enc_type == REDIS_RDB_TYPE_SET : skip_strings = self.read_length(f) elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2 : - skip_strings = self.read_length(f) * 2 + length = self.read_length(f) + for x in range(length): + skip_string(f) + skip_binary_double(f) if enc_type == REDIS_RDB_TYPE_ZSET_2 else skip_float(f) elif enc_type == 
REDIS_RDB_TYPE_HASH : skip_strings = self.read_length(f) * 2 elif enc_type == REDIS_RDB_TYPE_HASH_ZIPMAP : @@ -555,6 +562,8 @@ def skip_object(self, f, enc_type): skip_strings = 1 elif enc_type == REDIS_RDB_TYPE_LIST_QUICKLIST: skip_strings = self.read_length(f) + elif enc_type == REDIS_RDB_TYPE_MODULE: + raise Exception('skip_object', 'Unable to skip Redis Modules RDB objects (key %s)' % (enc_type, self._key)) else : raise Exception('skip_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) for x in range(0, skip_strings): @@ -753,7 +762,7 @@ def init_filter(self, filters): self._filters['not_keys'] = str2regexp(filters['not_keys']) if not 'types' in filters: - self._filters['types'] = ('set', 'hash', 'sortedset', 'sortedset2', 'module', 'string', 'list') + self._filters['types'] = ('set', 'hash', 'sortedset', 'module', 'string', 'list') elif isinstance(filters['types'], bytes): self._filters['types'] = (filters['types'], ) elif isinstance(filters['types'], list): From 3a01ef7d7320a1afa63c76f0cbcd2540db732c7a Mon Sep 17 00:00:00 2001 From: Guy Benoish Date: Tue, 21 Feb 2017 09:59:54 +0200 Subject: [PATCH 40/41] fixed wrong float skip in case of NaN or INF --- rdbtools/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index 4745ff5..14c6972 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -530,7 +530,8 @@ def skip_string(self, f): def skip_float(self, f): dbl_length = read_unsigned_char(f) - skip(f, dbl_length if dbl_length < 253 else 1) + if dbl_length < 253: + skip(f, dbl_length) def skip_binary_double(self, f): skip(f, 8) From 1433f8ff39d74c7976354970a554b0e3cc99ae09 Mon Sep 17 00:00:00 2001 From: oranagra Date: Sun, 26 Feb 2017 02:35:47 -0800 Subject: [PATCH 41/41] bump version to 0.1.9 --- CHANGES | 24 ++++++++++++++---------- rdbtools/__init__.py | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/CHANGES b/CHANGES index 9270be2..97d192c 100644 
--- a/CHANGES +++ b/CHANGES @@ -1,11 +1,15 @@ -* 0.1.1 - * Fixed lzf decompression - * Standard python project layout - * Python script to automatically create test RDB files - * Adds test cases - * Adds setup.py to easily install this library - * Adds MIT license +* 0.1.9 + * python 3 support + * rdb v8 (redis 4.0) support + * binary to string conversion fixes + * use ujson/cStringIO/python-lzf if they're available + * filter keys by size + * bugfixes parsing sorted sets + * fix setup.py dependancies and remove requirements.txt file + +* 0.1.8 + * fix a crash in the memory profiler recently introduced. + +* 0.1.7 + * rdb v7 (redis 3.2) support -* 0.1.0 - * Initial version - * Specification for RDB file format diff --git a/rdbtools/__init__.py b/rdbtools/__init__.py index 9d4e0b8..8a8d0d8 100644 --- a/rdbtools/__init__.py +++ b/rdbtools/__init__.py @@ -2,7 +2,7 @@ from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback, KeyValsOnlyCallback, KeysOnlyCallback from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator, PrintJustKeys -__version__ = '0.1.7' +__version__ = '0.1.9' VERSION = tuple(map(int, __version__.split('.'))) __all__ = [