Skip to content

Commit

Permalink
Added simple json2jsonl conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
ivbeg committed Jul 24, 2022
1 parent f46191e commit ac43fa8
Showing 1 changed file with 22 additions and 0 deletions.
22 changes: 22 additions & 0 deletions undatum/cmds/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,27 @@ def bson_to_jsonl(fromname, toname, options={}, default_options={}):
source.close()
output.close()


def json_to_jsonl(fromname, toname, options={}, default_options={}):
    """Convert a JSON document into a JSON lines file.

    The whole source file is parsed into memory with ``json.load`` and every
    item of the resulting top-level iterable is serialized with
    ``orjson.dumps`` and written to the output followed by ``LINEEND``.

    If ``options`` contains ``'tagname'`` and the parsed document is a dict
    holding that key, records are taken from ``source_data[tagname]`` instead
    of from the document itself.

    NOTE(review): when the document is a dict and no matching ``tagname`` is
    available, iterating it yields the dict's keys, so only the keys are
    written as records. Confirm whether that is intended.

    :param fromname: path of the JSON file to read.
    :param toname: path of the JSON lines file to write (opened with 'wb',
        so an existing file is overwritten).
    :param options: conversion options; only ``'tagname'`` is read here.
    :param default_options: passed to ``__copy_options`` together with
        ``options`` to build the effective option set.
    """
    # The mutable defaults are kept for interface compatibility; neither
    # argument is mutated here, ``options`` is only rebound.
    options = __copy_options(options, default_options)

    # Parse first and release the source handle immediately; the context
    # manager also guarantees it is closed if parsing fails.
    with open(fromname, 'rb') as source:
        source_data = json.load(source)

    data = source_data
    if 'tagname' in options:
        tagname = options['tagname']
        if isinstance(source_data, dict) and tagname in source_data:
            data = source_data[tagname]

    # The output is opened only after a successful parse, so a broken input
    # file does not leave an empty output behind.
    with open(toname, 'wb') as output:
        for n, record in enumerate(data, start=1):
            output.write(orjson.dumps(record) + LINEEND)
            if n % 10000 == 0:
                logging.info('json2jsonl: processed %d records', n)


def csv_to_parquet(fromname, toname, options={}, default_options={'encoding': 'utf8', 'delimiter': ','}):
options = __copy_options(options, default_options)
df = pandas.read_csv(fromname, delimiter=options['delimiter'], encoding=options['encoding'])
Expand All @@ -319,6 +340,7 @@ def jsonl_to_parquet(fromname, toname, options={},
'xml2jsonl': xml_to_jsonl,
'jsonl2csv': jsonl_to_csv,
'bson2jsonl': bson_to_jsonl,
'json2jsonl': json_to_jsonl,
'csv2parquet' : csv_to_parquet,
'jsonl2parquet': jsonl_to_parquet,
}
Expand Down

0 comments on commit ac43fa8

Please sign in to comment.