Commit: Improve tests

niloc132 committed Dec 5, 2022
1 parent 4b7a291 commit 3aac4be
Showing 1 changed file with 46 additions and 16 deletions.
62 changes: 46 additions & 16 deletions py/server/tests/test_parquet.py
@@ -13,6 +13,7 @@
from deephaven import empty_table, dtypes, new_table
from deephaven.column import InputColumn
from deephaven.parquet import write, batch_write, read, delete, ColumnInstruction
from deephaven.table import Table

from tests.testbase import BaseTestCase

@@ -151,35 +152,64 @@ def test_round_trip_data(self):
"""

# create a table with columns to test different types and edge cases
dh_table = empty_table(5).update("Name=(String) null")

# Round-trip the data through parquet:
# * dh->parquet->dataframe (via pyarrow)->dh
# * dh->parquet->dataframe (via pyarrow)->parquet->dh
# * dh->dataframe (via pyarrow)->parquet->dh
dh_table = empty_table(20).update(formulas=[
"someStringColumn = i % 10 == 0?null:(`` + (i % 101))",
"nonNullString = `` + (i % 60)",
"nonNullPolyString = `` + (i % 600)",
"someIntColumn = i",
"someLongColumn = ii",
"someDoubleColumn = i*1.1",
"someFloatColumn = (float)(i*1.1)",
"someBoolColumn = i % 3 == 0?true:i%3 == 1?false:null",
"someShortColumn = (short)i",
"someByteColumn = (byte)i",
"someCharColumn = (char)i",
# TODO pyarrow indicates this value is out of the allowed range
# "someTime = DateTime.now() + i",
"someKey = `` + (int)(i /100)",
"nullKey = i < -1?`123`:null",
"nullIntColumn = (int)null",
"nullLongColumn = (long)null",
"nullDoubleColumn = (double)null",
"nullFloatColumn = (float)null",
"nullBoolColumn = (Boolean)null",
"nullShortColumn = (short)null",
"nullByteColumn = (byte)null",
"nullCharColumn = (char)null",
"nullTime = (DateTime)null",
"nullKey = i == 50 ? `hi` : (String)null",
# TODO BigInteger/BigDecimal columns don't roundtrip cleanly
# "nullBigDecColumn = (java.math.BigDecimal)null",
# "nullBigIntColumn = (java.math.BigInteger)null"
])
# These tests are done with each of the fully-supported compression formats
self.round_trip_with_compression("UNCOMPRESSED", dh_table)
self.round_trip_with_compression("SNAPPY", dh_table)
self.round_trip_with_compression("LZO", dh_table)
self.round_trip_with_compression("LZ4", dh_table)
# LZO is not fully supported in python/c++
# self.round_trip_with_compression("LZO", dh_table)
# TODO(deephaven-core#3148) This test seems to write parquet output with LZ4_RAW as the compression type, Java can't read it
# self.round_trip_with_compression("LZ4", dh_table)
self.round_trip_with_compression("GZIP", dh_table)
self.round_trip_with_compression("ZSTD", dh_table)

self.assertFalse(True, "fail to verify the test is run")

def round_trip_with_compression(self, compression_codec_name, dh_table):
# dh->parquet->dataframe (via pyarrow)->dh
write(dh_table, "data_from_dh.parquet", compression_codec_name=compression_codec_name)
dataframe = pandas.read_parquet('data_from_dh.parquet')
dataframe = pandas.read_parquet('data_from_dh.parquet', use_nullable_dtypes=True)
result_table = to_table(dataframe)
self.assert_table_equals(dh_table, result_table)
dataframe.to_parquet('data_from_pandas.parquet', compression=compression_codec_name)
result_table = read('data_from_pandas.parquet')
self.assert_table_equals(dh_table, result_table)
dataframe = to_pandas(dh_table)
dataframe.to_parquet('data_from_pandas.parquet', compression=compression_codec_name)

# dh->parquet->dataframe (via pyarrow)->parquet->dh
dataframe.to_parquet('data_from_pandas.parquet', compression=None if compression_codec_name == 'UNCOMPRESSED' else compression_codec_name)
result_table = read('data_from_pandas.parquet')
self.assert_table_equals(dh_table, result_table)

# dh->dataframe (via pyarrow)->parquet->dh
# TODO(deephaven-core#3149) disable for now, since to_pandas results in "None" strings instead of None values
# dataframe = to_pandas(dh_table)
# dataframe.to_parquet('data_from_pandas.parquet', compression=None if compression_codec_name == 'UNCOMPRESSED' else compression_codec_name)
# result_table = read('data_from_pandas.parquet')
# self.assert_table_equals(dh_table, result_table)

if __name__ == '__main__':
unittest.main()
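
As context for the LZO and LZ4 cases disabled above, the following sketch (not part of the commit) shows one way to probe which compression codecs the local pyarrow build actually supports. The codec list is illustrative, and the try/except around unrecognized names is an assumption about how pyarrow reports codecs it does not know at all.

# Sketch: check parquet compression codec support in the local pyarrow build.
# "lzo" is expected to be rejected, since pyarrow does not expose an LZO codec.
import pyarrow as pa

for codec in ["snappy", "gzip", "zstd", "lz4", "brotli", "lzo"]:
    try:
        available = pa.Codec.is_available(codec)
    except ValueError:
        # pyarrow raises ValueError for codec names it does not recognize
        available = False
    print(f"{codec}: {'available' if available else 'not available'}")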
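
The switch to use_nullable_dtypes=True in pandas.read_parquet matters for the null-heavy columns this test adds. A minimal sketch of the behavior, not part of the commit (the file name nullable_demo.parquet is arbitrary; newer pandas versions replace this keyword with dtype_backend='numpy_nullable'):

# Sketch: without nullable dtypes, an int64 parquet column containing nulls is
# read back as float64 with NaN; with them, pandas keeps an Int64 column with pd.NA.
import pandas as pd

df = pd.DataFrame({"x": pd.array([1, None, 3], dtype="Int64")})
df.to_parquet("nullable_demo.parquet")

plain = pd.read_parquet("nullable_demo.parquet")
nullable = pd.read_parquet("nullable_demo.parquet", use_nullable_dtypes=True)

print(plain["x"].dtype)     # float64 -- the null became NaN
print(nullable["x"].dtype)  # Int64   -- the null is preserved as <NA>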
