Skip to content

Commit

Permalink
✨ export_json (#1789)
Browse files Browse the repository at this point in the history
  • Loading branch information
maartenbreddels authored Jan 13, 2022
1 parent 1b04e08 commit b54442d
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
42 changes: 42 additions & 0 deletions packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import division, print_function
import io
import difflib
import base64
from typing import Iterable
Expand Down Expand Up @@ -6853,6 +6854,47 @@ def export_csv(self, path, progress=None, chunk_size=default_chunk_size, paralle
progressbar(1.0)
return

@docsubst
def export_json(self, to, progress=None, chunk_size=default_chunk_size, parallel=True, fs_options=None, fs=None):
    """ Exports the DataFrame to a JSON file.

    The output is a single JSON array holding one object per row
    (the layout pandas calls ``orient='records'``).

    :param to: filename or file object
    :param progress: {progress}
    :param int chunk_size: {chunk_size_export}
    :param parallel: {evaluate_parallel}
    :param fs_options: {fs_options}
    :param fs: {fs}
    :return:
    """
    # imported locally; we may want to pass the module as parameter to use a faster library
    import json

    # not sure if we want to use pandas, it will treat datetime for us, but will convert null to nan
    use_pandas = True

    def _chunk_payloads():
        # Yield each chunk serialized as a JSON array with the enclosing
        # '[' and ']' stripped off, as bytes.
        if use_pandas:
            for _i1, _i2, df in self.to_pandas_df(chunk_size=chunk_size, parallel=parallel):
                f_temp = io.BytesIO()
                df.to_json(f_temp, orient='records')
                yield f_temp.getvalue()[1:-1]
        else:
            for _i1, _i2, records in self.to_records(chunk_size=chunk_size, parallel=parallel):
                yield json.dumps(records)[1:-1].encode("utf8")

    # we take off the '[' and ']' from each chunk, and insert them back ourselves
    # and we also need to put ',' between each chunk
    with vaex.progress.tree(progress, title="export(json)"), vaex.file.open(path=to, mode='wb', fs_options=fs_options, fs=fs) as f:
        f.write(b"[")
        first = True
        for payload in _chunk_payloads():
            if not payload:
                # A chunk that serialized to '[]' contributes nothing; writing a
                # separator for it would produce invalid JSON ('[, ...' or '..., ]').
                continue
            if not first:
                f.write(b", ")
            first = False
            f.write(payload)
        f.write(b"]")

def _needs_copy(self, column_name):
import vaex.file.other
return not \
Expand Down
10 changes: 10 additions & 0 deletions tests/export_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,13 @@ def test_export_generates_same_hdf5_shasum(tmpdir, dtypes):
shasum2.update(data)

assert shasum1.hexdigest() == shasum2.hexdigest()


def test_export_json(tmpdir, df_filtered):
    """Export a filtered DataFrame to JSON and check it reads back with the same values."""
    target = tmpdir / 'test.json'
    df_filtered.export_json(target)
    roundtrip = vaex.from_json(target, orient='records')
    # NOTE: only the 'x' and 'name' columns are compared, not every column
    # returned by df.get_column_names().
    for name in ('x', 'name'):
        expected = df_filtered[name].tolist()
        actual = roundtrip[name].tolist()
        assert expected == actual

0 comments on commit b54442d

Please sign in to comment.