From 50c9138606a208d6bf927dc714b753130532b62e Mon Sep 17 00:00:00 2001 From: Ivan Begtin Date: Tue, 1 Feb 2022 00:14:53 +0300 Subject: [PATCH] Added support of BSON files to analyze command --- HISTORY.rst | 4 +-- README.rst | 2 +- undatum/cmds/analyzer.py | 65 ++++++++++++++++++---------------------- undatum/utils.py | 36 ++++++++++++++++++++++ 4 files changed, 68 insertions(+), 39 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 25e7a3f..5ae2dba 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,8 +4,8 @@ History ======= 1.0.12 (2022-01-30) -------------------- -* Added command "analyze" it provides human-readable information about data files: CSV, JSON lines, JSON, XML. Detects encoding, delimiters, type of files, fields with objects for JSON and XML files. Doesn't support Gzipped, ZIPped and other comressed files yet. +------------------- +* Added command "analyze" it provides human-readable information about data files: CSV, BSON, JSON lines, JSON, XML. Detects encoding, delimiters, type of files, fields with objects for JSON and XML files. Doesn't support Gzipped, ZIPped and other compressed files yet. 1.0.11 (2022-01-30) ------------------- diff --git a/README.rst b/README.rst index 4ea76c8..55f1d7b 100644 --- a/README.rst +++ b/README.rst @@ -328,7 +328,7 @@ Analyzes data format and provides human-readable information. 
Returned values will include: * Filename - name of the file -* File type - type of the file, could be: jsonl, xml, csv, json +* File type - type of the file, could be: jsonl, xml, csv, json, bson * Encoding - file encoding * Delimiter - file delimiter if CSV file * File size - size of the file, bytes diff --git a/undatum/cmds/analyzer.py b/undatum/cmds/analyzer.py index 27e0a4d..4757ee8 100644 --- a/undatum/cmds/analyzer.py +++ b/undatum/cmds/analyzer.py @@ -1,5 +1,7 @@ +# -*- coding: utf8 -*- +# FIXME: A lot of unoptimized code here, it could be better, shorter and some functions could be improved import os -from ..utils import get_file_type, get_option, dict_generator, guess_int_size, guess_datatype, detect_delimiter, detect_encoding, get_dict_value +from ..utils import get_file_type, get_option, dict_generator, guess_int_size, guess_datatype, detect_delimiter, detect_encoding, get_dict_value, get_dict_keys, _is_flat, buf_count_newlines_gen from ..constants import SUPPORTED_FILE_TYPES from collections import OrderedDict import bson @@ -11,41 +13,6 @@ import xmltodict OBJECTS_ANALYZE_LIMIT = 100 -def buf_count_newlines_gen(fname): - def _make_gen(reader): - while True: - b = reader(2 ** 16) - if not b: break - yield b - - with open(fname, "rb") as f: - count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read)) - return count - - -def get_dict_keys(iterable, limit=1000): - n = 0 - keys = [] - for item in iterable: - if limit and n > limit: - break - n += 1 - dk = dict_generator(item) - for i in dk: - k = ".".join(i[:-1]) - if k not in keys: - keys.append(k) - return keys - - -def _is_flat(item): - """Measures if object is flat""" - for k, v in item.items(): - if isinstance(v, tuple) or isinstance(v, list): - return False - elif isinstance(v, dict): - if not _is_flat(v): return False - return True def analyze_csv(filename, objects_limit=OBJECTS_ANALYZE_LIMIT): @@ -98,6 +65,30 @@ def analyze_jsonl(filename, objects_limit=OBJECTS_ANALYZE_LIMIT): return report +def 
analyze_bson(filename, objects_limit=OBJECTS_ANALYZE_LIMIT): + """Analyzes BSON file""" + report = [] + report.append(['Filename', filename]) + report.append(['File type', 'bson']) + report.append(['Filesize', str(os.path.getsize(filename))]) + f = open(filename, 'rb') + flat = True + objects = [] + n = 0 + for o in bson.decode_file_iter(f): + n += 1 + objects.append(o) + if n > objects_limit: + break + f.close() + for o in objects[:objects_limit]: + if not _is_flat(o): + flat = False + report.append(['Is flat table?', str(flat)]) + report.append(['Fields', str('\n'.join(get_dict_keys(objects)))]) + return report + + def analyze_json(filename, objects_limit=OBJECTS_ANALYZE_LIMIT, filesize_limit=500000000): """Analyzes JSON file""" report = [] @@ -275,6 +266,8 @@ def analyze(self, filename, options): table = analyze_csv(filename) elif filetype == 'jsonl': table = analyze_jsonl(filename) + elif filetype == 'bson': + table = analyze_bson(filename) elif filetype == 'json': table = analyze_json(filename) elif filetype == 'xml': diff --git a/undatum/utils.py b/undatum/utils.py index e024445..5b28fdf 100644 --- a/undatum/utils.py +++ b/undatum/utils.py @@ -182,3 +182,39 @@ def guess_datatype(s, qd): attrs = {'base' : 'empty'} return attrs + +def buf_count_newlines_gen(fname): + def _make_gen(reader): + while True: + b = reader(2 ** 16) + if not b: break + yield b + + with open(fname, "rb") as f: + count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read)) + return count + + +def get_dict_keys(iterable, limit=1000): + n = 0 + keys = [] + for item in iterable: + if limit and n > limit: + break + n += 1 + dk = dict_generator(item) + for i in dk: + k = ".".join(i[:-1]) + if k not in keys: + keys.append(k) + return keys + + +def _is_flat(item): + """Measures if object is flat""" + for k, v in item.items(): + if isinstance(v, tuple) or isinstance(v, list): + return False + elif isinstance(v, dict): + if not _is_flat(v): return False + return True