ENH: allow saving wide dataframes to hdf with format table #26135

Closed

wants to merge 34 commits

Changes from 6 commits (34 commits total)
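For context, a minimal usage sketch of what this PR enables, mirroring the new test cases at the bottom of the diff (the filename is illustrative): a dataframe with 10,000 columns round-tripped through HDF5 in table format, which the PR supports by moving column metadata out of size-limited node attributes and into VLArrays.

import numpy as np
import pandas as pd

# a "wide" frame: 10,000 columns, as in the new test cases
df = pd.DataFrame(np.random.random((10, 10000)))

df.to_hdf("wide.h5", "df", format="table")  # the case this PR enables
reread = pd.read_hdf("wide.h5", "df")
pd.testing.assert_frame_equal(df, reread)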
cd35777
Merge pull request #1 from pandas-dev/master
Jul 27, 2017
633be78
added two test cases for storing wide dataframes in table format
Apr 18, 2019
3ba10ef
Added support for wide tables with format 'table'. Columns are saved …
Apr 18, 2019
4c20cdd
cleanup
Apr 18, 2019
7ef9e30
Merge remote-tracking branch 'upstream/master'
Apr 18, 2019
adf378e
Accidentally worked on old pandas version. Resolved merge conflicts
Apr 18, 2019
2ecc05e
Linting, cleanup and replaced string_types with str
Apr 18, 2019
6451f8c
Fixed tables import
P-Tillmann Apr 20, 2019
059bbc1
changed test to only check compression filter on table data, not columns
P-Tillmann Apr 20, 2019
1c1f872
added tests for reading columns from legacy tables. Rearranged positi…
P-Tillmann Apr 20, 2019
e4d81bf
Linting
P-Tillmann Apr 20, 2019
37efd62
added legacy hdf file for tests
P-Tillmann Apr 20, 2019
05aac5b
NumPy on Windows creates int32 arrays by default. Need to cast to int…
P-Tillmann Apr 23, 2019
8cd08f3
Merge remote-tracking branch 'upstream/master'
Apr 23, 2019
ead6518
Merge remote-tracking branch 'upstream/master'
Apr 23, 2019
c539d9d
added two test cases for storing wide dataframes in table format
Apr 18, 2019
c553ee5
Added support for wide tables with format 'table'. Columns are saved …
Apr 18, 2019
872552b
cleanup
Apr 18, 2019
ee3cdba
Linting, cleanup and replaced string_types with str
Apr 18, 2019
a2c2764
Fixed tables import
P-Tillmann Apr 20, 2019
f8c94cb
changed test to only check compression filter on table data, not columns
P-Tillmann Apr 20, 2019
8484293
added tests for reading columns from legacy tables. Rearranged positi…
P-Tillmann Apr 20, 2019
c3db771
Linting
P-Tillmann Apr 20, 2019
95b193c
added legacy hdf file for tests
P-Tillmann Apr 20, 2019
3684fa6
NumPy on Windows creates int32 arrays by default. Need to cast to int…
P-Tillmann Apr 23, 2019
54c1657
cleanup
May 17, 2019
99ef34b
Merge branch 'master' of github.com:P-Tillmann/pandas into wide_pytables
May 17, 2019
d3414f2
Merge branch 'wide_pytables' of github.com:P-Tillmann/pandas into wid…
May 17, 2019
3488e1c
Merge branch 'master' of https://github.com/pandas-dev/pandas into wi…
May 20, 2019
f903f29
Rebased to upstream
Jul 26, 2019
96d0ec6
Included unsaved changes for rebase. Fixed typo. Corrected Docstring.
Jul 26, 2019
4d0466e
Black reformatting
Jul 26, 2019
af10a71
Fix for blosc compression test case
Jul 26, 2019
f725d20
black reformatting
Jul 26, 2019
76 changes: 71 additions & 5 deletions pandas/io/pytables.py
@@ -38,6 +38,8 @@
from pandas.core.index import ensure_index
from pandas.core.internals import BlockManager, _block_shape, make_block

from tables.exceptions import NoSuchNodeError

from pandas.io.common import _stringify_path
from pandas.io.formats.printing import adjoin, pprint_thing

@@ -1611,6 +1613,7 @@ def infer(self, handler):
"""infer this column from the table: create and return a new object"""
table = handler.table
new_self = self.copy()
new_self._handle = handler._handle
new_self.set_table(table)
new_self.get_attr()
new_self.read_metadata(handler)
@@ -1668,6 +1671,10 @@ def cvalues(self):
""" return my cython values """
return self.values

@property
def handle(self):
return self._handle

def __iter__(self):
return iter(self.values)

@@ -1691,6 +1698,7 @@ def validate_names(self):
pass

def validate_and_set(self, handler, append):
self._handle = handler._handle
self.set_table(handler.table)
self.validate_col()
self.validate_attr(append)
@@ -2230,15 +2238,38 @@ def convert(self, values, nan_rep, encoding, errors):
return self

def get_attr(self):
""" get the data for this column """
""" get the data for this colummn """
# reading tables prior to 0.x.x
self.values = getattr(self.attrs, self.kind_attr, None)

if self.values is None:
try:
data = self.handle.get_node(self.attrs._v_node._v_parent,
self.kind_attr)[:]
data = np.array(data, dtype='object')
# check for multiindex
if len(data.shape) > 1 and data.shape[1] > 1:
self.values = list(map(tuple, data.tolist()))
else:
self.values = data.tolist()
except NoSuchNodeError:
pass

self.dtype = getattr(self.attrs, self.dtype_attr, None)
self.meta = getattr(self.attrs, self.meta_attr, None)
self.set_kind()

def set_attr(self):
""" set the data for this column """
setattr(self.attrs, self.kind_attr, self.values)
group, key = self.attrs._v_node._v_parent, self.kind_attr
if key in group:
self.handle.remove_node(group, key)

vlarray = self.handle.create_vlarray(group, key,
_tables().ObjectAtom())
for fld in self.values:
vlarray.append(fld)

setattr(self.attrs, self.meta_attr, self.meta)
if self.dtype is not None:
setattr(self.attrs, self.dtype_attr, self.dtype)
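For readers less familiar with PyTables, here is a standalone sketch of the VLArray pattern that set_attr and get_attr now use: one ObjectAtom row per column label, stored as a sibling node rather than as a size-limited node attribute. The filename and node name below are illustrative.

import tables

with tables.open_file("labels.h5", mode="w") as h5:
    # write: one variable-length row per label, no attribute size limit
    vlarray = h5.create_vlarray("/", "values_block_0_kind", tables.ObjectAtom())
    for label in ("col_%d" % i for i in range(10000)):
        vlarray.append(label)

    # read back: slicing the node returns the stored objects as a list
    labels = h5.get_node("/", "values_block_0_kind")[:]
    assert labels[0] == "col_0"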
@@ -3240,12 +3271,48 @@ def set_info(self):
""" update our table index info """
self.attrs.info = self.info

def set_non_index_axes(self):
""" Write the axes to carrays """
group = self.attrs._v_node

def f(dim, flds):
key = "non_index_axes_%d" % dim
if key in group:
self.handle.remove_node(group, key)

vlarray = self._handle.create_vlarray(group, key,
_tables().ObjectAtom())
for fld in flds:
vlarray.append(fld)
return dim, key

replacement = [f(dim, flds) for dim, flds in self.non_index_axes]
self.attrs.non_index_axes = replacement

def get_non_index_axes(self):
"""Load the non-index axes from their carrays. This is a pass-through
for tables stored prior to v0.xx"""
def f(dim, flds):
if isinstance(flds, str):
flds = self._handle.get_node(self.attrs._v_node, flds)[:]
flds = np.array(flds, dtype='object')
if len(flds.shape) > 1 and flds.shape[1] > 1:
flds = list(map(tuple, flds.tolist()))
else:
flds = flds.tolist()
return dim, flds
else:
return dim, flds # if not a string presumably pre v0.xx list
non_index_axes = getattr(self.attrs, 'non_index_axes', [])
new = [f(dim, flds) for dim, flds in non_index_axes]
return new

def set_attrs(self):
""" set our table type & indexables """
self.attrs.table_type = str(self.table_type)
self.attrs.index_cols = self.index_cols()
self.attrs.values_cols = self.values_cols()
self.attrs.non_index_axes = self.non_index_axes
self.set_non_index_axes()
self.attrs.data_columns = self.data_columns
self.attrs.nan_rep = self.nan_rep
self.attrs.encoding = self.encoding
@@ -3256,8 +3323,6 @@ def set_attrs(self):

def get_attrs(self):
""" retrieve our attributes """
self.non_index_axes = getattr(
self.attrs, 'non_index_axes', None) or []
self.data_columns = getattr(
self.attrs, 'data_columns', None) or []
self.info = getattr(
@@ -3276,6 +3341,7 @@ def get_attrs(self):
]
self.metadata = getattr(
self.attrs, 'metadata', None) or []
self.non_index_axes = self.get_non_index_axes()

def validate_version(self, where=None):
""" are we trying to operate on an old version? """
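The MultiIndex branch in get_non_index_axes is easy to verify in isolation. A small sketch of just that conversion, with illustrative values:

import numpy as np

# 2-D object array, as stored for a MultiIndex axis
flds = np.array([("a", 1), ("a", 2), ("b", 1)], dtype="object")

if len(flds.shape) > 1 and flds.shape[1] > 1:
    labels = list(map(tuple, flds.tolist()))  # MultiIndex -> list of tuples
else:
    labels = flds.tolist()                    # flat axis -> list of scalars

assert labels == [("a", 1), ("a", 2), ("b", 1)]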
22 changes: 22 additions & 0 deletions pandas/tests/io/test_pytables.py
@@ -5148,3 +5148,25 @@ def test_dst_transitions(self):
store.append('df', df)
result = store.select('df')
assert_frame_equal(result, df)

def test_wide_table_format(self):
# test storing wide dataframes in table format

df = DataFrame(np.random.random((10, 10000)))

with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', format='table')
reread = read_hdf(path, 'df')
assert_frame_equal(df, reread)

def test_append_wide_table_format(self):
# test appending a wide dataframe to hdf

df1 = DataFrame(np.random.random((10, 10000)))
df2 = DataFrame(np.random.random((10, 10000)))

with ensure_clean_path(self.path) as path:
df1.to_hdf(path, 'df', format='table')
df2.to_hdf(path, 'df', append=True)
reread = read_hdf(path)
assert_frame_equal(pd.concat([df1, df2]), reread)
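
For completeness, the same append path expressed through the public HDFStore interface rather than to_hdf. A sketch with an illustrative filename:

import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.random.random((10, 10000)))
df2 = pd.DataFrame(np.random.random((10, 10000)))

with pd.HDFStore("wide_append.h5", mode="w") as store:
    store.put("df", df1, format="table")  # wide table, now allowed
    store.append("df", df2)               # append rows under the same key
    result = store.select("df")

pd.testing.assert_frame_equal(pd.concat([df1, df2]), result)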