✨ export_json (#1789)

vaexio · Jan 13, 2022 · b54442d · b54442d
1 parent 1b04e08
commit b54442d
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 0 deletions.
diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import division, print_function
+import io
 import difflib
 import base64
 from typing import Iterable
@@ -6853,6 +6854,47 @@ def export_csv(self, path, progress=None, chunk_size=default_chunk_size, paralle
         progressbar(1.0)
         return
 
+    @docsubst
+    def export_json(self, to, progress=None, chunk_size=default_chunk_size, parallel=True, fs_options=None, fs=None):
+        """ Exports the DataFrame to a JSON file, written as one array of row records.
+
+        :param to: filename or file object
+        :param progress: {progress}
+        :param int chunk_size: {chunk_size_export}
+        :param parallel: {evaluate_parallel}
+        :param fs_options: {fs_options}
+        :param fs: {fs}
+        :return: None
+        """
+        json = None  # we may want to pass the module as parameter to use a faster library
+        import json as json_std
+        json = json or json_std  # always the standard library json for now, since json is None above
+
+        # not sure if we want to use pandas, it will treat datetime for us, but will convert null to nan
+        use_pandas = True  # hard-coded, so the to_records branch below is currently never taken
+
+        # each chunk serializes to its own JSON array: we strip the '[' and ']' from each chunk and write one pair ourselves
+        # and we also need to put ', ' between consecutive chunks
+        with vaex.progress.tree(progress, title="export(json)"), vaex.file.open(path=to, mode='wb', fs_options=fs_options, fs=fs) as f:
+            f.write(b"[")  # opening bracket of the single top-level array
+            first = True  # no separator is written before the first chunk
+            if use_pandas:
+                for _i1, _i2, df in self.to_pandas_df(chunk_size=chunk_size, parallel=parallel):
+                    if not first:
+                        f.write(b", ")  # NOTE(review): an empty chunk would leave a dangling separator - confirm chunks are never empty
+                    first = False
+                    f_temp = io.BytesIO()  # serialize the chunk in memory so its brackets can be removed
+                    df.to_json(f_temp, orient='records')
+                    f.write(f_temp.getvalue()[1:-1])  # drop the first and last byte (presumably '[' and ']')
+            else:
+                for _i1, _i2, records in self.to_records(chunk_size=chunk_size, parallel=parallel):
+                    if not first:
+                        f.write(b", ")
+                    first = False
+                    raw = json.dumps(records)[1:-1]  # drop the first and last character of the serialized chunk
+                    f.write(raw.encode("utf8"))
+            f.write(b"]")
+
     def _needs_copy(self, column_name):
         import vaex.file.other
         return not \

diff --git a/tests/export_test.py b/tests/export_test.py
@@ -240,3 +240,13 @@ def test_export_generates_same_hdf5_shasum(tmpdir, dtypes):
             shasum2.update(data)
 
     assert shasum1.hexdigest() == shasum2.hexdigest()
+
+
+def test_export_json(tmpdir, df_filtered):  # round trip: export to JSON, read it back, compare columns
+    df = df_filtered
+    path = tmpdir / 'test.json'
+    df.export_json(path)
+    df2 = vaex.from_json(path, orient='records')
+    # TODO: compare every column (df.get_column_names()); for now only 'x' and 'name' are checked
+    for column in ['x', 'name']:
+        assert df[column].tolist() == df2[column].tolist()