From 7d636b015c7ebc97265941ca45e73a09af9a2a5c Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 22 Aug 2013 16:54:01 -0400
Subject: [PATCH 1/3] BUG/API: (GH4584) to_hdf was raising when passing both
 arguments append and table

---
 doc/source/release.rst           |  1 +
 pandas/io/pytables.py            |  6 ++++--
 pandas/io/tests/test_pytables.py | 34 ++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 261bbd424478e..8400dab2a70c7 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -200,6 +200,7 @@ See :ref:`Internal Refactoring`
     with a different block ordering (:issue:`4096`)
   - ``read_hdf`` was not respecting as passed ``mode`` (:issue:`4504`)
   - appending a 0-len table will work correctly (:issue:`4273`)
+  - ``to_hdf`` was raising when passing both arguments ``append`` and ``table`` (:issue:`4584`)
   - Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError
     exception while trying to access trans[pos + 1] (:issue:`4496`)
   - The ``by`` argument now works correctly with the ``layout`` argument
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 608bbe4703272..4064f97ae0870 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -690,7 +690,7 @@ def remove(self, key, where=None, start=None, stop=None):
             raise ValueError('can only remove with where on objects written as tables')
         return s.delete(where = where, start=start, stop=stop)
 
-    def append(self, key, value, columns=None, **kwargs):
+    def append(self, key, value, columns=None, append=True, **kwargs):
         """
         Append to Table in file. Node must already exist and be Table
         format.
@@ -699,6 +699,7 @@ def append(self, key, value, columns=None, **kwargs):
         ----------
         key : object
         value : {Series, DataFrame, Panel, Panel4D}
+        append : boolean, default True; append the input data to the existing data
         data_columns : list of columns to create as data columns, or True to use all columns
         min_itemsize : dict of columns that specify minimum string sizes
         nan_rep : string to use as string nan represenation
@@ -714,7 +715,8 @@ def append(self, key, value, columns=None, **kwargs):
         if columns is not None:
             raise Exception("columns is not a supported keyword in append, try data_columns")
 
-        self._write_to_group(key, value, table=True, append=True, **kwargs)
+        kwargs['table'] = True
+        self._write_to_group(key, value, append=append, **kwargs)
 
     def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
         """
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index cfe162c887799..0de50960190e9 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -147,6 +147,40 @@ def roundtrip(key, obj,**kwargs):
         finally:
             safe_remove(self.path)
 
+    def test_api(self):
+
+        # GH4584
+        # API issue when to_hdf doesn't accept append AND table args
+        with tm.ensure_clean(self.path) as path:
+
+            df = tm.makeDataFrame()
+            df.iloc[:10].to_hdf(path,'df',append=True,table=True)
+            df.iloc[10:].to_hdf(path,'df',append=True,table=True)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+            # append=False (overwrite), then append=True
+            df.iloc[:10].to_hdf(path,'df',append=False,table=True)
+            df.iloc[10:].to_hdf(path,'df',append=True,table=True)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+        with tm.ensure_clean(self.path) as path:
+
+            df = tm.makeDataFrame()
+            df.to_hdf(path,'df',append=False,table=False)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+        with ensure_clean(self.path) as store:
+
+            df = tm.makeDataFrame()
+            store.append('df',df.iloc[:10],append=True,table=True)
+            store.append('df',df.iloc[10:],append=True,table=True)
+            assert_frame_equal(store.select('df'),df)
+
+            # append=False (overwrite), then append=True
+            store.append('df',df.iloc[:10],append=False,table=True)
+            store.append('df',df.iloc[10:],append=True,table=True)
+            assert_frame_equal(store.select('df'),df)
+
     def test_keys(self):
 
         with ensure_clean(self.path) as store:
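A minimal usage sketch of what PATCH 1/3 enables (GH4584), modeled on the test_api test above. This is illustrative only: the file name store.h5 and the random frame are assumptions, and assert_frame_equal is the same helper the test suite imports.

    import numpy as np
    import pandas as pd
    from pandas.util.testing import assert_frame_equal

    df = pd.DataFrame(np.random.randn(20, 4), columns=list('ABCD'))

    # before the patch, passing table= through to_hdf while append()
    # also hard-coded table=True raised a duplicate-keyword error inside
    # _write_to_group; append() now takes an explicit append= keyword
    # and injects table=True itself
    df.iloc[:10].to_hdf('store.h5', 'df', append=False, table=True)  # create/overwrite
    df.iloc[10:].to_hdf('store.h5', 'df', append=True, table=True)   # append rows

    assert_frame_equal(pd.read_hdf('store.h5', 'df'), df)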
From a3abf80bfd78c7570ffa06eed4d0b4a0093cb965 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 22 Aug 2013 16:56:29 -0400
Subject: [PATCH 2/3] CLN: pep8 pandas/io/pytables

---
 pandas/io/pytables.py | 692 ++++++++++++++++++++++++++----------------
 1 file changed, 422 insertions(+), 270 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4064f97ae0870..0d944afad4d19 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -64,9 +64,11 @@ def _ensure_encoding(encoding):
 class PossibleDataLossError(Exception):
     pass
 
+
 class ClosedFileError(Exception):
     pass
 
+
 class IncompatibilityWarning(Warning):
     pass
 
@@ -76,6 +78,7 @@ class IncompatibilityWarning(Warning):
 the copy_to method)
 """
 
+
 class AttributeConflictWarning(Warning):
     pass
 
@@ -84,6 +87,7 @@ class AttributeConflictWarning(Warning):
 [%s], resetting the attribute to None
 """
 
+
 class DuplicateWarning(Warning):
     pass
 
@@ -111,30 +115,30 @@ class DuplicateWarning(Warning):
 
 # storer class map
 _STORER_MAP = {
-    u('TimeSeries')    : 'LegacySeriesStorer',
-    u('Series')        : 'LegacySeriesStorer',
-    u('DataFrame')     : 'LegacyFrameStorer',
-    u('DataMatrix')    : 'LegacyFrameStorer',
-    u('series')        : 'SeriesStorer',
-    u('sparse_series') : 'SparseSeriesStorer',
-    u('frame')         : 'FrameStorer',
-    u('sparse_frame')  : 'SparseFrameStorer',
-    u('wide')          : 'PanelStorer',
-    u('sparse_panel')  : 'SparsePanelStorer',
+    u('TimeSeries'): 'LegacySeriesStorer',
+    u('Series'): 'LegacySeriesStorer',
+    u('DataFrame'): 'LegacyFrameStorer',
+    u('DataMatrix'): 'LegacyFrameStorer',
+    u('series'): 'SeriesStorer',
+    u('sparse_series'): 'SparseSeriesStorer',
+    u('frame'): 'FrameStorer',
+    u('sparse_frame'): 'SparseFrameStorer',
+    u('wide'): 'PanelStorer',
+    u('sparse_panel'): 'SparsePanelStorer',
 }
 
 # table class map
 _TABLE_MAP = {
-    u('generic_table') : 'GenericTable',
+    u('generic_table'): 'GenericTable',
     u('appendable_series') : 'AppendableSeriesTable',
     u('appendable_multiseries'): 'AppendableMultiSeriesTable',
-    u('appendable_frame') : 'AppendableFrameTable',
-    u('appendable_multiframe') : 'AppendableMultiFrameTable',
-    u('appendable_panel') : 'AppendablePanelTable',
-    u('appendable_ndim') : 'AppendableNDimTable',
-    u('worm') : 'WORMTable',
-    u('legacy_frame') : 'LegacyFrameTable',
-    u('legacy_panel') : 'LegacyPanelTable',
+    u('appendable_frame'): 'AppendableFrameTable',
+    u('appendable_multiframe'): 'AppendableMultiFrameTable',
+    u('appendable_panel'): 'AppendablePanelTable',
+    u('appendable_ndim'): 'AppendableNDimTable',
+    u('worm'): 'WORMTable',
+    u('legacy_frame'): 'LegacyFrameTable',
+    u('legacy_panel'): 'LegacyPanelTable',
 }
 
 # axes map
@@ -163,6 +167,7 @@ def _tables():
 
     return _table_mod
 
+
 def h5_open(path, mode):
     tables = _tables()
     return tables.openFile(path, mode)
@@ -192,7 +197,7 @@ def get_store(path, **kwargs):
         store.close()
 
-### interface to/from ###
+# interface to/from ###
 
 def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs):
     """ store this object, close it if we opened it """
@@ -207,6 +212,7 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app
     else:
         f(path_or_buf)
 
+
 def
read_hdf(path_or_buf, key, **kwargs): """ read from the store, closeit if we opened it @@ -231,13 +237,14 @@ def read_hdf(path_or_buf, key, **kwargs): """ - f = lambda store, auto_close: store.select(key, auto_close=auto_close, **kwargs) + f = lambda store, auto_close: store.select( + key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, compat.string_types): # can't auto open/close if we are using an iterator # so delegate to the iterator - store = HDFStore(path_or_buf,**kwargs) + store = HDFStore(path_or_buf, **kwargs) try: return f(store, True) except: @@ -253,7 +260,9 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) + class HDFStore(StringMixin): + """ dict-like IO interface for storing pandas objects in PyTables format. @@ -345,7 +354,8 @@ def __contains__(self, key): node = self.get_node(key) if node is not None: name = node._v_pathname - if name == key or name[1:] == key: return True + if name == key or name[1:] == key: + return True return False def __len__(self): @@ -356,7 +366,7 @@ def __unicode__(self): if self.is_open: lkeys = list(self.keys()) if len(lkeys): - keys = [] + keys = [] values = [] for k in lkeys: @@ -364,10 +374,12 @@ def __unicode__(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append(pprint_thing(s or 'invalid_HDFStore node')) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) except Exception as detail: keys.append(k) - values.append("[invalid_HDFStore node: %s]" % pprint_thing(detail)) + values.append( + "[invalid_HDFStore node: %s]" % pprint_thing(detail)) output += adjoin(12, keys, values) else: @@ -405,14 +417,14 @@ def open(self, mode='a'): if self._mode != mode: # if we are chaning a write mode to read, ok - if self._mode in ['a','w'] and mode in ['r','r+']: + if self._mode in ['a', 'w'] and mode in ['r', 'r+']: pass elif mode in ['w']: # this would truncate, raise here if self.is_open: raise PossibleDataLossError("Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!".format(self._path,self._mode)) + "will delete the current file!".format(self._path, self._mode)) self._mode = mode @@ -449,7 +461,8 @@ def is_open(self): """ return a boolean indicating whether the file is open """ - if self._handle is None: return False + if self._handle is None: + return False return bool(self._handle.isopen) def flush(self): @@ -511,7 +524,8 @@ def func(_start, _stop): if iterator or chunksize is not None: if not s.is_table: - raise TypeError("can only use an iterator or chunksize on a table") + raise TypeError( + "can only use an iterator or chunksize on a table") return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() @@ -532,7 +546,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs def unique(self, key, column, **kwargs): warnings.warn("unique(key,column) is deprecated\n" "use select_column(key,column).unique() instead") - return self.get_storer(key).read_column(column = column, **kwargs).unique() + return self.get_storer(key).read_column(column=column, **kwargs).unique() def select_column(self, key, column, **kwargs): """ @@ -549,7 +563,7 @@ def select_column(self, key, column, **kwargs): raises ValueError if the column can not be extracted indivually (it is part of a data block) """ - return 
self.get_storer(key).read_column(column = column, **kwargs) + return self.get_storer(key).read_column(column=column, **kwargs) def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables @@ -585,7 +599,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star selector = keys[0] # collect the tables - tbls = [ self.get_storer(k) for k in keys ] + tbls = [self.get_storer(k) for k in keys] # validate rows nrows = None @@ -593,16 +607,19 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star if t is None: raise TypeError("Invalid table [%s]" % k) if not t.is_table: - raise TypeError("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + raise TypeError( + "object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) if nrows is None: nrows = t.nrows elif t.nrows != nrows: - raise ValueError("all tables must have exactly the same nrows!") + raise ValueError( + "all tables must have exactly the same nrows!") # select coordinates from the selector table try: - c = self.select_as_coordinates(selector, where, start=start, stop=stop) + c = self.select_as_coordinates( + selector, where, start=start, stop=stop) nrows = len(c) except (Exception) as detail: raise ValueError("invalid selector [%s]" % selector) @@ -610,7 +627,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star def func(_start, _stop): # collect the returns objs - objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls] + objs = [t.read(where=c[_start:_stop], columns=columns) + for t in tbls] # axis is the concentation axes axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] @@ -623,7 +641,6 @@ def func(_start, _stop): return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def put(self, key, value, table=None, append=False, **kwargs): """ Store object in HDFStore @@ -669,7 +686,8 @@ def remove(self, key, where=None, start=None, stop=None): except: if where is not None: - raise ValueError("trying to remove a node with a non-None where clause!") + raise ValueError( + "trying to remove a node with a non-None where clause!") # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -687,8 +705,9 @@ def remove(self, key, where=None, start=None, stop=None): # delete from the table else: if not s.is_table: - raise ValueError('can only remove with where on objects written as tables') - return s.delete(where = where, start=start, stop=stop) + raise ValueError( + 'can only remove with where on objects written as tables') + return s.delete(where=where, start=start, stop=stop) def append(self, key, value, columns=None, append=True, **kwargs): """ @@ -713,7 +732,8 @@ def append(self, key, value, columns=None, append=True, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception("columns is not a supported keyword in append, try data_columns") + raise Exception( + "columns is not a supported keyword in append, try data_columns") kwargs['table'] = True self._write_to_group(key, value, append=append, **kwargs) @@ -737,13 +757,16 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * """ if axes is not None: - raise Exception("axes is currently not accepted as a paremter to append_to_multiple; you 
can create the tables indepdently instead") + raise Exception( + "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") if not isinstance(d, dict): - raise ValueError("append_to_multiple must have a dictionary specified as the way to split the value") + raise ValueError( + "append_to_multiple must have a dictionary specified as the way to split the value") if selector not in d: - raise ValueError("append_to_multiple requires a selector that is in passed dict") + raise ValueError( + "append_to_multiple requires a selector that is in passed dict") # figure out the splitting axis (the non_index_axis) axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] @@ -754,7 +777,8 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * for k, v in d.items(): if v is None: if remain_key is not None: - raise ValueError("append_to_multiple can only have one value in d that is None") + raise ValueError( + "append_to_multiple can only have one value in d that is None") remain_key = k else: remain_values.extend(v) @@ -795,7 +819,8 @@ def create_table_index(self, key, **kwargs): raise Exception("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) - if s is None: return + if s is None: + return if not s.is_table: raise TypeError("cannot create table index on a non-table") @@ -805,8 +830,8 @@ def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() self._check_if_open() - return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr( - g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u('table')) ] + return [g for g in self._handle.walkNodes() if getattr(g._v_attrs, 'pandas_type', None) or getattr( + g, 'table', None) or (isinstance(g, _table_mod.table.Table) and g._v_name != u('table'))] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -827,8 +852,9 @@ def get_storer(self, key): s.infer_axes() return s - def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, - fletcher32 = False, overwrite = True): + def copy( + self, file, mode='w', propindexes=True, keys=None, complib = None, complevel = None, + fletcher32=False, overwrite=True): """ copy the existing store to a new file, upgrading in place Parameters @@ -843,13 +869,14 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None open file handle of the new store """ - new_store = HDFStore(file, mode = mode, complib = complib, complevel = complevel, fletcher32 = fletcher32) + new_store = HDFStore( + file, mode=mode, complib=complib, complevel=complevel, fletcher32 = fletcher32) if keys is None: keys = list(self.keys()) - if not isinstance(keys, (tuple,list)): - keys = [ keys ] + if not isinstance(keys, (tuple, list)): + keys = [keys] for k in keys: - s = self.get_storer(k) + s = self.get_storer(k) if s is not None: if k in new_store: @@ -861,38 +888,41 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None index = False if propindexes: - index = [ a.name for a in s.axes if a.is_indexed ] - new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding) + index = [a.name for a in s.axes if a.is_indexed] + new_store.append(k, data, index=index, data_columns=getattr( + s, 'data_columns', None), encoding=s.encoding) else: 
new_store.put(k, data, encoding=s.encoding) return new_store - ###### private methods ###### + # private methods ###### def _check_if_open(self): if not self.is_open: raise ClosedFileError("{0} file is not open!".format(self._path)) - def _create_storer(self, group, value = None, table = False, append = False, **kwargs): + def _create_storer(self, group, value=None, table=False, append=False, **kwargs): """ return a suitable Storer class to operate """ def error(t): - raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % - (t,group,type(value),table,append,kwargs)) + raise TypeError( + "cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % + (t, group, type(value), table, append, kwargs)) - pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) - tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) + pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) + tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) # infer the pt from the passed value if pt is None: if value is None: _tables() - if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): + if getattr(group, 'table', None) or isinstance(group, _table_mod.table.Table): pt = u('frame_table') tt = u('generic_table') else: - raise TypeError("cannot create a storer if the object is not existing nor a value are passed") + raise TypeError( + "cannot create a storer if the object is not existing nor a value are passed") else: try: @@ -932,7 +962,7 @@ def error(t): elif index.nlevels > 1: tt = u('appendable_multiframe') elif pt == u('wide_table'): - tt = u('appendable_panel') + tt = u('appendable_panel') elif pt == u('ndim_table'): tt = u('appendable_ndim') @@ -952,7 +982,8 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, + def _write_to_group( + self, key, value, index=True, table=False, append=False, complib=None, encoding=None, **kwargs): group = self.get_node(key) @@ -999,16 +1030,19 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, raise ValueError('Compression not supported on non-table') # write the object - s.write(obj = value, append=append, complib=complib, **kwargs) + s.write(obj=value, append=append, complib=complib, **kwargs) + if s.is_table and index: - s.create_index(columns = index) + s.create_index(columns=index) def _read_group(self, group, **kwargs): s = self._create_storer(group) s.infer_axes() return s.read(**kwargs) + class TableIterator(object): + """ define the iteration interface on a table Parameters @@ -1027,13 +1061,13 @@ class TableIterator(object): def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): self.store = store - self.func = func + self.func = func self.nrows = nrows or 0 self.start = start or 0 if stop is None: stop = self.nrows - self.stop = min(self.nrows,stop) + self.stop = min(self.nrows, stop) if chunksize is None: chunksize = 100000 @@ -1064,7 +1098,9 @@ def get_values(self): self.close() return results + class IndexCol(StringMixin): + """ an index column description class Parameters @@ -1079,9 +1115,10 @@ class IndexCol(StringMixin): """ is_an_indexable = True is_data_indexable = True - _info_fields = ['freq','tz','index_name'] + _info_fields = ['freq', 'tz', 'index_name'] - def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, + def __init__( + self, 
values=None, kind=None, typ=None, cname=None, itemsize=None, name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, index_name=None, **kwargs): self.values = values @@ -1130,7 +1167,8 @@ def set_table(self, table): return self def __unicode__(self): - temp = tuple(map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + temp = tuple( + map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1144,7 +1182,7 @@ def __ne__(self, other): def is_indexed(self): """ return whether I am an indexed column """ try: - return getattr(self.table.cols,self.cname).is_indexed + return getattr(self.table.cols, self.cname).is_indexed except: False @@ -1166,7 +1204,7 @@ def convert(self, values, nan_rep, encoding): except: pass - values =_maybe_convert(values, self.kind, encoding) + values = _maybe_convert(values, self.kind, encoding) kwargs = dict() if self.freq is not None: @@ -1177,15 +1215,18 @@ def convert(self, values, nan_rep, encoding): self.values = Index(values, **kwargs) except: - # if the output freq is different that what we recorded, then infer it + # if the output freq is different that what we recorded, then infer + # it if 'freq' in kwargs: kwargs['freq'] = 'infer' - self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs) + self.values = Index( + _maybe_convert(values, self.kind, encoding), **kwargs) # set the timezone if indicated # we stored in utc, so reverse to local timezone if self.tz is not None: - self.values = self.values.tz_localize('UTC').tz_convert(_ensure_decoded(self.tz)) + self.values = self.values.tz_localize( + 'UTC').tz_convert(_ensure_decoded(self.tz)) return self @@ -1248,7 +1289,7 @@ def validate_col(self, itemsize=None): raise ValueError("Trying to store a string with len [%s] in [%s] column but\n" "this column has a limit of [%s]!\n" "Consider using min_itemsize to preset the sizes on these columns" - % (itemsize,self.cname, c.itemsize)) + % (itemsize, self.cname, c.itemsize)) return c.itemsize return None @@ -1267,7 +1308,7 @@ def update_info(self, info): for key in self._info_fields: - value = getattr(self,key,None) + value = getattr(self, key, None) try: idx = info[self.name] @@ -1278,18 +1319,18 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq','index_name']: - ws = attribute_conflict_doc % (key,existing_value,value) + if key in ['freq', 'index_name']: + ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning) # reset idx[key] = None - setattr(self,key,None) + setattr(self, key, None) else: raise ValueError("invalid info for [%s] for [%s]""" ", existing_value [%s] conflicts with new value [%s]" % (self.name, - key,existing_value,value)) + key, existing_value, value)) else: if value is not None or existing_value is not None: idx[key] = value @@ -1310,7 +1351,9 @@ def set_attr(self): """ set the kind for this colummn """ setattr(self.attrs, self.kind_attr, self.kind) + class GenericIndexCol(IndexCol): + """ an index which is not represented in the data of the table """ @property @@ -1329,7 +1372,9 @@ def get_attr(self): def set_attr(self): pass + class DataCol(IndexCol): + """ a data holding column, by definition this is not indexable Parameters @@ -1407,11 +1452,12 @@ def set_kind(self): elif dtype.startswith(u('bool')): self.kind = 'bool' else: - raise 
AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self)) + raise AssertionError( + "cannot interpret dtype of [%s] in [%s]" % (dtype, self)) # set my typ if we need if self.typ is None: - self.typ = getattr(self.description,self.cname,None) + self.typ = getattr(self.description, self.cname, None) def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ @@ -1427,7 +1473,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No raise TypeError( "[date] is not implemented as a table column") elif inferred_type == 'datetime': - if getattr(rvalues[0],'tzinfo',None) is not None: + if getattr(rvalues[0], 'tzinfo', None) is not None: # if this block has more than one timezone, raise if len(set([r.tzinfo for r in rvalues])) != 1: @@ -1436,7 +1482,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # convert this column to datetime64[ns] utc, and save the tz index = DatetimeIndex(rvalues) - tz = getattr(index,'tz',None) + tz = getattr(index, 'tz', None) if tz is None: raise TypeError( "invalid timezone specification") @@ -1450,7 +1496,8 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No self.tz = zone self.update_info(info) - self.set_atom_datetime64(block, values.reshape(block.values.shape)) + self.set_atom_datetime64( + block, values.reshape(block.values.shape)) else: raise TypeError( @@ -1462,7 +1509,8 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # this is basically a catchall; if say a datetime64 has nans then will # end up here ### elif inferred_type == 'string' or dtype == 'object': - self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding) + self.set_atom_string( + block, existing_col, min_itemsize, nan_rep, encoding) else: self.set_atom_data(block) @@ -1474,13 +1522,14 @@ def get_atom_string(self, block, itemsize): def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep)[0] - data = block.values + data = block.values # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel()) if inferred_type != 'string': - # we cannot serialize this data, so report an exception on a column by column basis + # we cannot serialize this data, so report an exception on a column + # by column basis for item in block.items: col = block.get(item) @@ -1488,8 +1537,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): if inferred_type != 'string': raise TypeError("Cannot serialize the column [%s] because\n" "its data contents are [%s] object dtype" % - (item,inferred_type)) - + (item, inferred_type)) # itemsize is the maximum length of a string (along any dimension) itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) @@ -1534,7 +1582,7 @@ def set_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) - def set_atom_datetime64(self, block, values = None): + def set_atom_datetime64(self, block, values=None): self.kind = 'datetime64' self.typ = self.get_atom_datetime64(block) if values is None: @@ -1585,8 +1633,10 @@ def convert(self, values, nan_rep, encoding): # data should be 2-dim here # we stored as utc, so just set the tz - index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz) - self.data = 
np.array(index.tolist(),dtype=object).reshape(self.data.shape) + index = DatetimeIndex( + self.data.ravel(), tz='UTC').tz_convert(self.tz) + self.data = np.array( + index.tolist(), dtype=object).reshape(self.data.shape) else: self.data = np.asarray(self.data, dtype='M8[ns]') @@ -1607,14 +1657,15 @@ def convert(self, values, nan_rep, encoding): # convert nans / decode if _ensure_decoded(self.kind) == u('string'): - self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + self.data = _unconvert_string_array( + self.data, nan_rep=nan_rep, encoding=encoding) return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1625,6 +1676,7 @@ def set_attr(self): class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ is_data_indexable = True @@ -1637,13 +1689,17 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() + class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ def get_attr(self): pass + class Storer(StringMixin): + """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -1655,14 +1711,14 @@ class Storer(StringMixin): group : the group node where the table resides """ pandas_kind = None - obj_type = None - ndim = None - is_table = False + obj_type = None + ndim = None + is_table = False def __init__(self, parent, group, encoding=None, **kwargs): - self.parent = parent - self.group = group - self.encoding = _ensure_encoding(encoding) + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1671,7 +1727,8 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) + version = _ensure_decoded( + getattr(self.group._v_attrs, 'pandas_version', None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1688,9 +1745,9 @@ def __unicode__(self): self.infer_axes() s = self.shape if s is not None: - if isinstance(s, (list,tuple)): + if isinstance(s, (list, tuple)): s = "[%s]" % ','.join([pprint_thing(x) for x in s]) - return "%-12.12s (shape->%s)" % (self.pandas_type,s) + return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type def set_object_info(self): @@ -1758,14 +1815,15 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable,'nrows',None) + return getattr(self.storable, 'nrows', None) def validate(self, other): """ validate against an existing storable """ - if other is None: return + if other is None: + return return True - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? 
""" return True @@ -1780,12 +1838,14 @@ def infer_axes(self): return True def read(self, **kwargs): - raise NotImplementedError("cannot read on an abstract storer: subclasses should implement") + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement") def write(self, **kwargs): - raise NotImplementedError("cannot write on an abstract storer: sublcasses should implement") + raise NotImplementedError( + "cannot write on an abstract storer: sublcasses should implement") - def delete(self, where = None, **kwargs): + def delete(self, where=None, **kwargs): """ support fully deleting the node in its entirety (only) - where specification must be None """ if where is None: self._handle.removeNode(self.group, recursive=True) @@ -1793,11 +1853,14 @@ def delete(self, where = None, **kwargs): raise TypeError("cannot delete on an abstract storer") + class GenericStorer(Storer): + """ a generified storer version """ - _index_type_map = { DatetimeIndex: 'datetime', + _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} - _reverse_index_map = dict([ (v,k) for k, v in compat.iteritems(_index_type_map) ]) + _reverse_index_map = dict([(v, k) + for k, v in compat.iteritems(_index_type_map)]) attributes = [] # indexer helpders @@ -1806,7 +1869,8 @@ def _class_to_alias(self, cls): def _alias_to_class(self, alias): if isinstance(alias, type): # pragma: no cover - return alias # compat: for a short period of time master stored types + # compat: for a short period of time master stored types + return alias return self._reverse_index_map.get(alias, Index) def _get_index_factory(self, klass): @@ -1835,9 +1899,9 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) + self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) for n in self.attributes: - setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None))) + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) def write(self, obj, **kwargs): self.set_attrs() @@ -1898,7 +1962,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index,self.encoding).set_name('index') + converted = _convert_index(index, self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1916,7 +1980,6 @@ def write_index(self, key, index): zone = tslib.tot_seconds(index.tz.utcoffset()) node._v_attrs.tz = zone - def write_block_index(self, key, index): self.write_array('%s_blocs' % key, index.blocs) self.write_array('%s_blengths' % key, index.blengths) @@ -1996,10 +2059,12 @@ def read_index_node(self, node): kwargs['tz'] = node._v_attrs['tz'] if kind in (u('date'), u('datetime')): - index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), dtype=object, **kwargs) else: - index = factory(_unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), **kwargs) index.name = name @@ -2050,7 +2115,8 @@ def write_array(self, key, value, items=None): if value.dtype.type == np.object_: - # infer the type, warn if we have a non-string type here (for performance) + # infer the type, warn if we have a non-string type here (for + # performance) inferred_type = 
lib.infer_dtype(value.ravel()) if empty_array: pass @@ -2061,7 +2127,7 @@ def write_array(self, key, value, items=None): items = list(items) except: pass - ws = performance_doc % (inferred_type,key,items) + ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, @@ -2078,14 +2144,16 @@ def write_array(self, key, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed + class LegacyStorer(GenericStorer): def read_index_legacy(self, key): - node = getattr(self.group,key) + node = getattr(self.group, key) data = node[:] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) + class LegacySeriesStorer(LegacyStorer): def read(self, **kwargs): @@ -2094,6 +2162,7 @@ def read(self, **kwargs): values = self.read_array('values') return Series(values, index=index) + class LegacyFrameStorer(LegacyStorer): def read(self, **kwargs): @@ -2103,6 +2172,7 @@ def read(self, **kwargs): values = self.read_array('values') return DataFrame(values, index=index, columns=columns) + class SeriesStorer(GenericStorer): pandas_kind = u('series') attributes = ['name'] @@ -2110,7 +2180,7 @@ class SeriesStorer(GenericStorer): @property def shape(self): try: - return len(getattr(self.group,'values')), + return len(getattr(self.group, 'values')), except: return None @@ -2130,9 +2200,10 @@ def write(self, obj, **kwargs): self.write_array('values', obj.values) self.attrs.name = obj.name + class SparseSeriesStorer(GenericStorer): pandas_kind = u('sparse_series') - attributes = ['name','fill_value','kind'] + attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2152,9 +2223,10 @@ def write(self, obj, **kwargs): self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind + class SparseFrameStorer(GenericStorer): pandas_kind = u('sparse_frame') - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2162,7 +2234,7 @@ def read(self, **kwargs): sdict = {} for c in columns: key = 'sparse_series_%s' % c - s = SparseSeriesStorer(self.parent, getattr(self.group,key)) + s = SparseSeriesStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() return SparseDataFrame(sdict, columns=columns, @@ -2181,12 +2253,13 @@ def write(self, obj, **kwargs): s = SparseSeriesStorer(self.parent, node) s.write(ss) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('columns', obj.columns) + class SparsePanelStorer(GenericStorer): pandas_kind = u('sparse_panel') - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2196,7 +2269,7 @@ def read(self, **kwargs): for name in items: key = 'sparse_frame_%s' % name node = getattr(self.group, key) - s = SparseFrameStorer(self.parent, getattr(self.group,key)) + s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -2205,7 +2278,7 @@ def read(self, **kwargs): def write(self, obj, **kwargs): super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + 
self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) for name, sdf in compat.iteritems(obj): @@ -2217,8 +2290,9 @@ def write(self, obj, **kwargs): s = SparseFrameStorer(self.parent, node) s.write(sdf) + class BlockManagerStorer(GenericStorer): - attributes = ['ndim','nblocks'] + attributes = ['ndim', 'nblocks'] is_shape_reversed = False @property @@ -2230,15 +2304,15 @@ def shape(self): items = 0 for i in range(self.nblocks): node = getattr(self.group, 'block%d_items' % i) - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: items += shape[0] # data shape node = getattr(self.group, 'block0_values') - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: - shape = list(shape[0:(ndim-1)]) + shape = list(shape[0:(ndim - 1)]) else: shape = [] @@ -2288,20 +2362,24 @@ def write(self, obj, **kwargs): self.write_array('block%d_values' % i, blk.values, items=blk.items) self.write_index('block%d_items' % i, blk.items) + class FrameStorer(BlockManagerStorer): pandas_kind = u('frame') - obj_type = DataFrame + obj_type = DataFrame + class PanelStorer(BlockManagerStorer): pandas_kind = u('wide') - obj_type = Panel + obj_type = Panel is_shape_reversed = True def write(self, obj, **kwargs): obj._consolidate_inplace() return super(PanelStorer, self).write(obj, **kwargs) + class Table(Storer): + """ represent a table: facilitate read/write of various types of tables @@ -2319,20 +2397,20 @@ class Table(Storer): """ pandas_kind = u('wide_table') - table_type = None - levels = 1 - is_table = True + table_type = None + levels = 1 + is_table = True is_shape_reversed = False def __init__(self, *args, **kwargs): super(Table, self).__init__(*args, **kwargs) - self.index_axes = [] + self.index_axes = [] self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.info = dict() - self.nan_rep = None - self.selection = None + self.values_axes = [] + self.data_columns = [] + self.info = dict() + self.nan_rep = None + self.selection = None @property def table_type_short(self): @@ -2341,18 +2419,20 @@ def table_type_short(self): def __unicode__(self): """ return a pretty representatgion of myself """ self.infer_axes() - dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' + dc = ",dc->[%s]" % ','.join( + self.data_columns) if len(self.data_columns) else '' ver = '' if self.is_old_version: - ver = "[%s]" % '.'.join([ str(x) for x in self.version ]) + ver = "[%s]" % '.'.join([str(x) for x in self.version]) return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (self.pandas_type, ver, self.table_type_short, self.nrows, self.ncols, - ','.join([ a.name for a in self.index_axes ]), + ','.join( + [a.name for a in self.index_axes]), dc) def __getitem__(self, c): @@ -2364,25 +2444,28 @@ def __getitem__(self, c): def validate(self, other): """ validate against an existing table """ - if other is None: return + if other is None: + return if other.table_type != self.table_type: raise TypeError("incompatible table_type with existing [%s - %s]" % (other.table_type, self.table_type)) - for c in ['index_axes','non_index_axes','values_axes']: - sv = getattr(self,c,None) - ov = getattr(other,c,None) + for c in ['index_axes', 'non_index_axes', 'values_axes']: + sv = getattr(self, c, None) + ov = getattr(other, c, None) if sv != ov: # show the error for the specific axes for i, sax in enumerate(sv): oax = ov[i] if sax != oax: - raise ValueError("invalid combinate of 
[%s] on appending data [%s] vs current table [%s]" % (c,sax,oax)) + raise ValueError( + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sax, oax)) # should never get here - raise Exception("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sv,ov)) + raise Exception( + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sv, ov)) @property def is_multi_index(self): @@ -2392,7 +2475,7 @@ def is_multi_index(self): @property def nrows_expected(self): """ based on our axes, compute the expected nrows """ - return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property def is_exists(self): @@ -2401,7 +2484,7 @@ def is_exists(self): @property def storable(self): - return getattr(self.group,'table',None) + return getattr(self.group, 'table', None) @property def table(self): @@ -2423,7 +2506,7 @@ def axes(self): @property def ncols(self): """ the number of total columns in the values axes """ - return sum([ len(a.values) for a in self.values_axes ]) + return sum([len(a.values) for a in self.values_axes]) @property def is_transposed(self): @@ -2453,44 +2536,54 @@ def values_cols(self): def set_info(self): """ update our table index info """ - self.attrs.info = self.info + self.attrs.info = self.info def set_attrs(self): """ set our table type & indexables """ - self.attrs.table_type = str(self.table_type) - self.attrs.index_cols = self.index_cols() - self.attrs.values_cols = self.values_cols() + self.attrs.table_type = str(self.table_type) + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns - self.attrs.nan_rep = self.nan_rep - self.attrs.encoding = self.encoding - self.attrs.levels = self.levels + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.levels = self.levels self.set_info() def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] - self.data_columns = getattr(self.attrs,'data_columns',None) or [] - self.info = getattr(self.attrs,'info',None) or dict() - self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) - self.levels = getattr(self.attrs,'levels',None) or [] + self.non_index_axes = getattr( + self.attrs, 'non_index_axes', None) or [] + self.data_columns = getattr( + self.attrs, 'data_columns', None) or [] + self.info = getattr( + self.attrs, 'info', None) or dict() + self.nan_rep = getattr(self.attrs, 'nan_rep', None) + self.encoding = _ensure_encoding( + getattr(self.attrs, 'encoding', None)) + self.levels = getattr( + self.attrs, 'levels', None) or [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? 
""" if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: - ws = incompatibility_doc % '.'.join([ str(x) for x in self.version ]) + ws = incompatibility_doc % '.'.join( + [str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): """ validate the min_itemisze doesn't contain items that are not in the axes this needs data_columns to be defined """ - if min_itemsize is None: return - if not isinstance(min_itemsize, dict): return + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return q = self.queryables() for k, v in min_itemsize.items(): @@ -2499,7 +2592,8 @@ def validate_min_itemsize(self, min_itemsize): if k == 'values': continue if k not in q: - raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k) + raise ValueError( + "min_itemsize has the key [%s] which is not an axis or data_column" % k) @property def indexables(self): @@ -2510,7 +2604,8 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols)]) # values columns dc = set(self.data_columns) @@ -2628,10 +2723,11 @@ def validate_data_columns(self, data_columns, min_itemsize): data_columns = [] # if min_itemsize is a dict, add the keys (exclude 'values') - if isinstance(min_itemsize,dict): + if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns ]) + data_columns.extend( + [k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns]) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] @@ -2657,8 +2753,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: axes = _AXES_MAP[type(obj)] except: - raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % - (self.group._v_name,type(obj))) + raise TypeError( + "cannot properly create the storer for: [group->%s,value->%s]" % + (self.group._v_name, type(obj))) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -2667,17 +2764,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + axes = [a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep self.encoding = existing_table.encoding - self.info = copy.copy(existing_table.info) + self.info = copy.copy(existing_table.info) else: existing_table = None # currently support on ndim-1 axes if len(axes) != self.ndim - 1: - raise ValueError("currently only support ndim-1 indexers in an AppendableTable") + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable") # create according to the new data self.non_index_axes = [] @@ -2714,7 +2812,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = 
[index_axes_map[a].set_pos(j).update_info(self.info) for j, + self.index_axes = [index_axes_map[a].set_pos( + j).update_info(self.info) for j, a in enumerate(axes)] j = len(self.index_axes) @@ -2734,7 +2833,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, blocks = block_obj._data.blocks if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns(data_columns, min_itemsize) + data_columns = self.validate_data_columns( + data_columns, min_itemsize) if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( data_columns), axis=axis)._data.blocks @@ -2744,7 +2844,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([ (tuple(b.items.tolist()),b) for b in blocks ]) + by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) new_blocks = [] for ea in existing_table.values_axes: items = tuple(ea.values) @@ -2752,7 +2852,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, b = by_items.pop(items) new_blocks.append(b) except: - raise ValueError("cannot match existing table structure for [%s] on appending data" % ','.join(items)) + raise ValueError( + "cannot match existing table structure for [%s] on appending data" % ','.join(items)) blocks = new_blocks # add my values @@ -2775,8 +2876,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: existing_col = existing_table.values_axes[i] except: - raise ValueError("Incompatible appended table [%s] with existing table [%s]" % - (blocks,existing_table.values_axes)) + raise ValueError( + "Incompatible appended table [%s] with existing table [%s]" % + (blocks, existing_table.values_axes)) else: existing_col = None @@ -2796,7 +2898,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, except (NotImplementedError, ValueError, TypeError) as e: raise e except (Exception) as detail: - raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail))) + raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % ( + b.dtype.name, b.items, str(detail))) j += 1 # validate our min_itemsize @@ -2829,23 +2932,24 @@ def process_filter(field, filt): # see if the field is the name of an axis if field == axis_name: - takers = op(axis_values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + takers = op(axis_values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: # we need to filter on this dimension - values = _ensure_index(getattr(obj,field).values) - filt = _ensure_index(filt) + values = _ensure_index(getattr(obj, field).values) + filt = _ensure_index(filt) # hack until we support reversed dim flags - if isinstance(obj,DataFrame): - axis_number = 1-axis_number - takers = op(values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [%s] for filtering!" % field) + raise ValueError( + "cannot find the field [%s] for filtering!" 
% field) obj = process_filter(field, filt) @@ -2885,10 +2989,11 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection( + self, where=where, start=start, stop=stop, **kwargs) return Coordinates(self.selection.select_coords(), group=self.group, where=where) - def read_column(self, column, where = None, **kwargs): + def read_column(self, column, where=None, **kwargs): """ return a single column from the table, generally only indexables are interesting """ # validate the version @@ -2899,14 +3004,16 @@ def read_column(self, column, where = None, **kwargs): return False if where is not None: - raise Exception("read_column does not currently accept a where clause") + raise Exception( + "read_column does not currently accept a where clause") # find the axes for a in self.axes: if column == a.name: if not a.is_data_indexable: - raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column) + raise ValueError( + "column [%s] can not be extracted individually; it is not data indexable" % column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -2915,7 +3022,9 @@ def read_column(self, column, where = None, **kwargs): raise KeyError("column [%s] not found in the table" % column) + class WORMTable(Table): + """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -2935,6 +3044,7 @@ def write(self, **kwargs): class LegacyTable(Table): + """ an appendable table: allow append/query/delete operations to a (possibily) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -3032,6 +3142,7 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): + """ support the legacy frame table """ pandas_kind = u('frame_table') table_type = u('legacy_frame') @@ -3042,12 +3153,14 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): + """ support the legacy panel table """ table_type = u('legacy_panel') obj_type = Panel class AppendableTable(LegacyTable): + """ suppor the new appendable table formats """ _indexables = None table_type = u('appendable') @@ -3114,26 +3227,29 @@ def write_data(self, chunksize): mask = mask.ravel() # broadcast the indexes if needed - indexes = [ a.cvalues for a in self.index_axes ] + indexes = [a.cvalues for a in self.index_axes] nindexes = len(indexes) bindexes = [] for i, idx in enumerate(indexes): # broadcast to all other indexes except myself if i > 0 and i < nindexes: - repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)]) - idx = np.tile(idx,repeater) + repeater = np.prod( + [indexes[bi].shape[0] for bi in range(0, i)]) + idx = np.tile(idx, repeater) - if i < nindexes-1: - repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)]) - idx = np.repeat(idx,repeater) + if i < nindexes - 1: + repeater = np.prod([indexes[bi].shape[0] + for bi in range(i + 1, nindexes)]) + idx = np.repeat(idx, repeater) bindexes.append(idx) # transpose the values so first dimension is last # reshape the values if needed - values = [ a.take_data() for a in self.values_axes] - values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ] + values = [a.take_data() for a in 
self.values_axes] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) + for v in values] bvalues = [] for i, v in enumerate(values): new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape @@ -3164,7 +3280,7 @@ def write_data_chunk(self, indexes, mask, values): try: nrows = indexes[0].shape[0] - rows = np.empty(nrows,dtype=self.dtype) + rows = np.empty(nrows, dtype=self.dtype) names = self.dtype.names nindexes = len(indexes) @@ -3174,7 +3290,7 @@ def write_data_chunk(self, indexes, mask, values): # values for i, v in enumerate(values): - rows[names[i+nindexes]] = v + rows[names[i + nindexes]] = v # mask rows = rows[~mask.ravel().astype(bool)] @@ -3187,7 +3303,8 @@ def write_data_chunk(self, indexes, mask, values): self.table.append(rows) self.table.flush() except Exception as detail: - raise Exception("tables cannot write this data -> %s" % str(detail)) + raise Exception( + "tables cannot write this data -> %s" % str(detail)) def delete(self, where=None, **kwargs): @@ -3243,6 +3360,7 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): + """ suppor the new appendable table formats """ pandas_kind = u('frame_table') table_type = u('appendable_frame') @@ -3272,10 +3390,10 @@ def read(self, where=None, columns=None, **kwargs): if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index,name=getattr(index,'name',None)) + cols_ = Index(index, name=getattr(index, 'name', None)) else: values = a.cvalues.T - index_ = Index(index,name=getattr(index,'name',None)) + index_ = Index(index, name=getattr(index, 'name', None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -3354,6 +3472,7 @@ def write(self, obj, **kwargs): return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs) class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ pandas_kind = u('frame_table') table_type = u('generic_table') @@ -3366,17 +3485,19 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group,'table',None) or self.group + return getattr(self.group, 'table', None) or self.group def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = [] - self.nan_rep = None - self.levels = [] + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] - self.data_columns = [ a.name for a in self.values_axes ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] + self.data_columns = [a.name for a in self.values_axes] @property def indexables(self): @@ -3386,11 +3507,12 @@ def indexables(self): d = self.description # the index columns is just a simple index - self._indexables = [ GenericIndexCol(name='index',axis=0) ] + self._indexables = [GenericIndexCol(name='index', axis=0)] for i, n in enumerate(d._v_names): - dc = GenericDataIndexableCol(name = n, pos=i, values = [ n ], version = self.version) + dc = GenericDataIndexableCol( + name=n, pos=i, values=[n], version = self.version) self._indexables.append(dc) return self._indexables @@ -3398,7 +3520,9 @@ def indexables(self): def write(self, **kwargs): raise NotImplementedError("cannot write on an generic table") + class AppendableMultiFrameTable(AppendableFrameTable): + """ a 

+
 class AppendableMultiFrameTable(AppendableFrameTable):
+
     """ a frame with a multi-index """
     table_type = u('appendable_multiframe')
     obj_type = DataFrame
@@ -3424,12 +3548,14 @@ def read(self, columns=None, **kwargs):
         for n in self.levels:
             if n not in columns:
                 columns.insert(0, n)
-        df = super(AppendableMultiFrameTable, self).read(columns=columns, **kwargs)
+        df = super(AppendableMultiFrameTable, self).read(
+            columns=columns, **kwargs)
         df.set_index(self.levels, inplace=True)
         return df


 class AppendablePanelTable(AppendableTable):
+
     """ support the new appendable table formats """
     table_type = u('appendable_panel')
     ndim = 3
@@ -3447,22 +3573,25 @@ def is_transposed(self):


 class AppendableNDimTable(AppendablePanelTable):
+
     """ support the new appendable table formats """
     table_type = u('appendable_ndim')
     ndim = 4
     obj_type = Panel4D

+
 def _convert_index(index, encoding=None):
-    index_name = getattr(index,'name',None)
+    index_name = getattr(index, 'name', None)

     if isinstance(index, DatetimeIndex):
         converted = index.asi8
         return IndexCol(converted, 'datetime64', _tables().Int64Col(),
-                        freq=getattr(index,'freq',None), tz=getattr(index,'tz',None),
+                        freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None),
                         index_name=index_name)
     elif isinstance(index, (Int64Index, PeriodIndex)):
         atom = _tables().Int64Col()
-        return IndexCol(index.values, 'integer', atom, freq=getattr(index,'freq',None),
+        return IndexCol(
+            index.values, 'integer', atom, freq=getattr(index, 'freq', None),
             index_name=index_name)

     if isinstance(index, MultiIndex):
@@ -3475,7 +3604,7 @@ def _convert_index(index, encoding=None):
     if inferred_type == 'datetime64':
         converted = values.view('i8')
         return IndexCol(converted, 'datetime64', _tables().Int64Col(),
-                        freq=getattr(index,'freq',None), tz=getattr(index,'tz',None),
+                        freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None),
                         index_name=index_name)
     elif inferred_type == 'datetime':
         converted = np.array([(time.mktime(v.timetuple()) +
@@ -3494,7 +3623,8 @@ def _convert_index(index, encoding=None):
         converted = _convert_string_array(values, encoding)
         itemsize = converted.dtype.itemsize
-        return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize,
+        return IndexCol(
+            converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize,
             index_name=index_name)
     elif inferred_type == 'unicode':
         atom = _tables().ObjectAtom()
@@ -3514,6 +3644,7 @@ def _convert_index(index, encoding=None):
         return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
                         index_name=index_name)

+
 def _unconvert_index(data, kind, encoding=None):
     kind = _ensure_decoded(kind)
     if kind == u('datetime64'):
@@ -3533,6 +3664,7 @@ def _unconvert_index(data, kind, encoding=None):
         raise ValueError('unrecognized index type %s' % kind)
     return index

+
 def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
     kind = _ensure_decoded(kind)
     if kind == u('datetime'):
@@ -3545,6 +3677,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
         raise ValueError('unrecognized index type %s' % kind)
     return index

+
 def _convert_string_array(data, encoding, itemsize=None):

     # encode if needed
@@ -3556,19 +3689,20 @@ def _convert_string_array(data, encoding, itemsize=None):
     if itemsize is None:
         itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))

-    data = np.array(data,dtype="S%d" % itemsize)
+    data = np.array(data, dtype="S%d" % itemsize)
     return data

+
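``_convert_string_array`` above packs object strings into a fixed-width byte column sized by the longest entry; ``_unconvert_string_array`` below widens them back. The numpy mechanics, sketched standalone (``max`` stands in for the cython ``lib.max_len_string_array`` helper)::

    import numpy as np

    data = np.array(['foo', 'barbaz', 'x'], dtype=object)

    # size the fixed-width dtype to the longest string
    itemsize = max(len(s) for s in data.ravel())
    packed = np.array(data, dtype='S%d' % itemsize)   # dtype('S6')

    # reading back: widen to object again, as _unconvert_string_array does
    unpacked = np.array(packed.ravel(), dtype=object).reshape(packed.shape)
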
 def _unconvert_string_array(data, nan_rep=None, encoding=None):
     """ deserialize a string array, possibly decoding """
     shape = data.shape
-    data = np.array(data.ravel(),dtype=object)
+    data = np.array(data.ravel(), dtype=object)

     # guard against a None encoding in PY3 (because of a legacy
     # where the passed encoding is actually None)
     encoding = _ensure_encoding(encoding)
     if encoding is not None and len(data):
-        f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object])
+        f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
         data = f(data)

     if nan_rep is None:
@@ -3577,6 +3711,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
         data = lib.string_array_replace_from_nan_rep(data, nan_rep)
     return data.reshape(shape)

+
 def _maybe_convert(values, val_kind, encoding):
     if _need_convert(val_kind):
         conv = _get_converter(val_kind, encoding)
@@ -3584,6 +3719,7 @@ def _maybe_convert(values, val_kind, encoding):
         values = conv(values)
     return values

+
 def _get_converter(kind, encoding):
     kind = _ensure_decoded(kind)
     if kind == 'datetime64':
@@ -3591,17 +3727,20 @@ def _get_converter(kind, encoding):
     elif kind == 'datetime':
         return lib.convert_timestamps
     elif kind == 'string':
-        return lambda x: _unconvert_string_array(x,encoding=encoding)
+        return lambda x: _unconvert_string_array(x, encoding=encoding)
     else:  # pragma: no cover
         raise ValueError('invalid kind %s' % kind)

+
 def _need_convert(kind):
     kind = _ensure_decoded(kind)
     if kind in (u('datetime'), u('datetime64'), u('string')):
         return True
     return False

+
 class Term(StringMixin):
+
     """create a term object that holds a field, op, and value

     Parameters
     ----------
@@ -3629,7 +3768,8 @@ class Term(StringMixin):
     """

     _ops = ['<=', '<', '>=', '>', '!=', '==', '=']
-    _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))
+    _search = re.compile(
+        "^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))
     _max_selectors = 31
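
A quick illustration of what the ``_search`` expression above tokenizes (standalone, using the same pattern)::

    import re

    ops = ['<=', '<', '>=', '>', '!=', '==', '=']
    pat = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$"
                     % '|'.join(ops))

    m = pat.match('major_axis>20121114')
    assert (m.group('field'), m.group('op'), m.group('value')) == \
        ('major_axis', '>', '20121114')
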
     def __init__(self, field, op=None, value=None, queryables=None, encoding=None):
@@ -3697,8 +3837,9 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None):

         # we have valid conditions
         if self.op in ['>', '>=', '<', '<=']:

-            if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,compat.string_types):
-                raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self))
+            if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value, compat.string_types):
+                raise ValueError(
+                    "an inequality condition cannot have multiple values [%s]" % str(self))

         if not is_list_like(self.value):
             self.value = [self.value]
@@ -3740,7 +3881,7 @@ def eval(self):
         if self.is_in_table:
             values = [self.convert_value(v) for v in self.value]
         else:
-            values = [TermValue(v,v,self.kind) for v in self.value]
+            values = [TermValue(v, v, self.kind) for v in self.value]

         # equality conditions
         if self.op in ['==', '!=']:
@@ -3751,21 +3892,22 @@ def eval(self):
             else:
                 filter_op = lambda axis, vals: axis.isin(vals)
-
             if self.is_in_table:

                 # too many values to create the expression?
                 if len(values) <= self._max_selectors:
-                    vs = [ self.generate(v) for v in values ]
+                    vs = [self.generate(v) for v in values]
                     self.condition = "(%s)" % ' | '.join(vs)

                 # use a filter after reading
                 else:
-                    self.filter = (self.field, filter_op, Index([v.value for v in values]))
+                    self.filter = (
+                        self.field, filter_op, Index([v.value for v in values]))

             else:

-                self.filter = (self.field, filter_op, Index([v.value for v in values]))
+                self.filter = (
+                    self.field, filter_op, Index([v.value for v in values]))

         else:
@@ -3775,7 +3917,8 @@ def eval(self):

             else:

-                raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self))
+                raise TypeError(
+                    "passing a filterable condition to a non-table indexer [%s]" % str(self))

     def convert_value(self, v):
         """ convert the expression that is in the term to something that is accepted by pytables """
@@ -3791,17 +3934,17 @@ def stringify(value):
             v = lib.Timestamp(v)
             if v.tz is not None:
                 v = v.tz_convert('UTC')
-            return TermValue(v,v.value,kind)
+            return TermValue(v, v.value, kind)
         elif (isinstance(v, datetime) or hasattr(v, 'timetuple')
                 or kind == u('date')):
             v = time.mktime(v.timetuple())
-            return TermValue(v,Timestamp(v),kind)
+            return TermValue(v, Timestamp(v), kind)
         elif kind == u('integer'):
             v = int(float(v))
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif kind == u('float'):
             v = float(v)
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif kind == u('bool'):
             if isinstance(v, compat.string_types):
                 poss_vals = [u('false'), u('f'), u('no'),
@@ -3810,15 +3953,17 @@ def stringify(value):
                 v = not v.strip().lower() in poss_vals
             else:
                 v = bool(v)
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif not isinstance(v, compat.string_types):
             v = stringify(v)
-            return TermValue(v,stringify(v),u('string'))
+            return TermValue(v, stringify(v), u('string'))

         # string quoting
-        return TermValue(v,stringify(v),u('string'))
+        return TermValue(v, stringify(v), u('string'))

+
 class TermValue(object):
+
     """ hold a term value that we use to construct a condition/filter """

     def __init__(self, value, converted, kind):
@@ -3835,7 +3980,9 @@ def tostring(self, encoding):
             return '"%s"' % self.converted
         return self.converted

+
 class Coordinates(object):
+
     """ holds a returned coordinates list, useful to select the
          same rows from different tables

     coordinates : holds the array of coordinates
@@ -3855,7 +4002,9 @@ def __getitem__(self, key):
         """ return a new coordinates object, sliced by the key """
         return Coordinates(self.values[key], self.group, self.where)

+
 class Selection(object):
+
     """
     Carries out a selection operation on a tables.Table object.
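
The ``Coordinates`` object above is designed for reuse: one query's row locations can drive later selects, and slicing yields a new ``Coordinates``. A sketch using the public ``select_as_coordinates`` API (file and key are illustrative)::

    import pandas as pd

    store = pd.HDFStore('data.h5')
    # row locations of 'df' matching the term, as a Coordinates object
    coords = store.select_as_coordinates('df', [('index', '>', '20130822')])

    # the same locations (or a slice of them) can be reused as a where
    subset = store.select('df', where=coords[:10])
    store.close()
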
@@ -3866,6 +4015,7 @@ class Selection(object):
     start, stop: indices to start and/or stop selection

     """
+
     def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.table = table
         self.where = where
@@ -3885,7 +4035,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
                 # see if we have a passed coordinate like
                 try:
                     inferred = lib.infer_dtype(where)
-                    if inferred=='integer' or inferred=='boolean':
+                    if inferred == 'integer' or inferred == 'boolean':
                         where = np.array(where)
                         if where.dtype == np.bool_:
                             start, stop = self.start, self.stop
@@ -3893,10 +4043,11 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
                                 start = 0
                             if stop is None:
                                 stop = self.table.nrows
-                            self.coordinates = np.arange(start,stop)[where]
-                        elif issubclass(where.dtype.type,np.integer):
-                            if (self.start is not None and (where<self.start).any()) or (self.stop is not None and (where>=self.stop).any()):
-                                raise ValueError("where must have index locations >= start and < stop")
+                            self.coordinates = np.arange(start, stop)[where]
+                        elif issubclass(where.dtype.type, np.integer):
+                            if (self.start is not None and (where < self.start).any()) or (self.stop is not None and (where >= self.stop).any()):
+                                raise ValueError(
+                                    "where must have index locations >= start and < stop")
                             self.coordinates = where

                 except:
@@ -3908,9 +4059,10 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):

         # create the numexpr & the filter
         if self.terms:
-            terms = [ t for t in self.terms if t.condition is not None ]
+            terms = [t for t in self.terms if t.condition is not None]
             if len(terms):
-                self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ])
+                self.condition = "(%s)" % ' & '.join(
+                    [t.condition for t in terms])
             self.filter = []
             for t in self.terms:
                 if t.filter is not None:
@@ -3955,13 +4107,13 @@ def select_coords(self):
         return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True)

-### utilities ###
+# utilities ###

-def timeit(key,df,fn=None,remove=True,**kwargs):
+def timeit(key, df, fn=None, remove=True, **kwargs):
     if fn is None:
         fn = 'timeit.h5'
-    store = HDFStore(fn,mode='w')
-    store.append(key,df,**kwargs)
+    store = HDFStore(fn, mode='w')
+    store.append(key, df, **kwargs)
     store.close()

     if remove:
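
With the ``Selection`` changes above, a raw locations array or a boolean mask can stand in for a ``where`` clause (:issue:`4467`). A minimal sketch against the 0.13-era API (file and key are illustrative, and assume 'df' was written as a table)::

    import numpy as np
    import pandas as pd

    store = pd.HDFStore('data.h5')
    nrows = store.get_storer('df').nrows

    # integer locations work as a where condition...
    first_five = store.select('df', where=np.arange(5))

    # ...and so does a full-length boolean mask
    mask = np.zeros(nrows, dtype=bool)
    mask[::2] = True
    evens = store.select('df', where=mask)
    store.close()
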
From 952a342773fba73925f1be127bf340cd0459f9b1 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 22 Aug 2013 18:01:02 -0400
Subject: [PATCH 3/3] API: the ``fmt`` keyword now replaces the table keyword;
 allowed values are ``s|t``

the same defaults as prior to 0.13.0 remain, e.g. ``put`` implies 's' (Storer)
format and ``append`` implies 't' (Table) format

---
 doc/source/io.rst                |  5 +-
 doc/source/release.rst           |  1 +
 doc/source/v0.13.0.txt           | 20 ++++++++
 pandas/io/pytables.py            | 86 ++++++++++++++++++++++------
 pandas/io/tests/test_pytables.py | 87 +++++++++++++++++++++---------
 5 files changed, 157 insertions(+), 42 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index f09ae1563f71b..73a7c2d1e1121 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1803,6 +1803,7 @@ The examples above show storing using ``put``, which write the HDF5 to ``PyTable
 the ``storer`` format. These types of stores are **not** appendable once written
 (though you can simply remove them and rewrite). Nor are they **queryable**; they must be
 retrieved in their entirety. These offer very fast writing and slightly faster reading than ``table`` stores.
+This format is specified by default when using ``put`` or by ``fmt='s'``

 .. warning::

@@ -1826,7 +1827,7 @@ Table Format
 format. Conceptually a ``table`` is shaped very much like a DataFrame,
 with rows and columns. A ``table`` may be appended to in the same or
 other sessions.  In addition, delete & query type operations are
-supported.
+supported. This format is specified by ``fmt='t'`` to ``append`` or ``put``.

 .. ipython:: python
    :suppress:

@@ -1853,7 +1854,7 @@ supported.

 .. note::

-   You can also create a ``table`` by passing ``table=True`` to a ``put`` operation.
+   You can also create a ``table`` by passing ``fmt='t'`` to a ``put`` operation.

 .. _io.hdf5-keys:

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 8400dab2a70c7..21fea6b64c042 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -98,6 +98,7 @@ pandas 0.13
     - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
       be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
     - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
+    - the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t``

   - ``JSON``

     - added ``date_unit`` parameter to specify resolution of timestamps. Options

diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 5003aa654d9fb..c0e3f5e6b2f10 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -66,6 +66,12 @@ API changes
       store2.close()
       store2

+  .. ipython:: python
+     :suppress:
+
+     import os
+     os.remove(path)
+
   - removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
     duplicate rows from a table (:issue:`4367`)
   - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
@@ -73,6 +79,20 @@ API changes
   - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
     See :ref:`here` for an example.

+  - the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t``
+    the same defaults as prior to 0.13.0 remain, e.g. ``put`` implies 's' (Storer) format
+    and ``append`` implies 't' (Table) format
+
+  .. ipython:: python
+
+     path = 'test.h5'
+     df = DataFrame(randn(10,2))
+     df.to_hdf(path,'df_table',fmt='t')
+     df.to_hdf(path,'df_table2',append=True)
+     df.to_hdf(path,'df_storer')
+     with get_store(path) as store:
+        print store
+
   .. ipython:: python
      :suppress:
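
The deprecation shim implemented in the pytables.py hunks below keeps the old spelling working: passing ``table`` now raises a ``FutureWarning`` and is mapped onto ``fmt``. A sketch of the equivalence (file name is illustrative)::

    import warnings
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(5)})

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        df.to_hdf('test.h5', 'df', table=True)   # deprecated spelling
        assert issubclass(w[-1].category, FutureWarning)

    # the replacement spelling writes the identical table format
    df.to_hdf('test.h5', 'df2', fmt='t')
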
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 0d944afad4d19..1eb8b0f266f68 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -100,6 +100,23 @@ class DuplicateWarning(Warning):
 map directly to c-types [inferred_type->%s,key->%s] [items->%s]
 """

+# formats
+_FORMAT_MAP = {
+    u('s') : 's',
+    u('storer') : 's',
+    u('t') : 't',
+    u('table') : 't',
+    }
+
+fmt_deprecate_doc = """
+the table keyword has been deprecated
+use the fmt='s|t' keyword instead
+  s : specifies the Storer format
+      and is the default for put operations
+  t : specifies the Table format
+      and is the default for append operations
+"""
+
 # map object types
 _TYPE_MAP = {

@@ -545,7 +562,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
     def unique(self, key, column, **kwargs):
         warnings.warn("unique(key,column) is deprecated\n"
-                      "use select_column(key,column).unique() instead")
+                      "use select_column(key,column).unique() instead",FutureWarning)
         return self.get_storer(key).read_column(column=column, **kwargs).unique()

     def select_column(self, key, column, **kwargs):
@@ -641,7 +658,7 @@ def func(_start, _stop):
         return TableIterator(self, func, nrows=nrows, start=start, stop=stop,
                              auto_close=auto_close).get_values()

-    def put(self, key, value, table=None, append=False, **kwargs):
+    def put(self, key, value, fmt=None, append=False, **kwargs):
         """
         Store object in HDFStore

         Parameters
         ----------
         key : object
         value : {Series, DataFrame, Panel}
-        table : boolean, default False
-            Write as a PyTables Table structure which may perform worse but
-            allow more flexible operations like searching / selecting subsets
-            of the data
+        fmt : 's|t', default is 's' for storer format
+            s : storer format
+                Fast writing/reading. Not-appendable, nor searchable
+            t : table format
+                Write as a PyTables Table structure which may perform worse but
+                allow more flexible operations like searching / selecting subsets
+                of the data
         append : boolean, default False
             For table data structures, append the input data to the existing table
         encoding : default None, provide an encoding for strings
         """
-        self._write_to_group(key, value, table=table, append=append, **kwargs)
+        kwargs = self._validate_format(fmt or 's', kwargs)
+        self._write_to_group(key, value, append=append, **kwargs)

     def remove(self, key, where=None, start=None, stop=None):
         """
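
``put`` now funnels its format through ``_FORMAT_MAP`` before writing, so the long spellings normalize to the short ones. A simplified standalone mirror of that validation (the real ``_validate_format`` also handles the deprecated ``table`` keyword)::

    _FORMAT_MAP = {'s': 's', 'storer': 's', 't': 't', 'table': 't'}

    def validate_fmt(fmt):
        # mirror of HDFStore._validate_format's lookup, simplified
        try:
            return _FORMAT_MAP[fmt.lower()]
        except (KeyError, AttributeError):
            raise TypeError("invalid HDFStore format specified [{0}]".format(fmt))

    assert validate_fmt('Table') == 't'
    assert validate_fmt('storer') == 's'
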
     def remove(self, key, where=None, start=None, stop=None):
         """
@@ -709,7 +730,7 @@ def remove(self, key, where=None, start=None, stop=None):
                 'can only remove with where on objects written as tables')
         return s.delete(where=where, start=start, stop=stop)

-    def append(self, key, value, columns=None, append=True, **kwargs):
+    def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
         """
         Append to Table in file. Node must already exist and be Table format.
@@ -718,6 +739,11 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
         ----------
         key : object
         value : {Series, DataFrame, Panel, Panel4D}
+        fmt : 't', default is 't' for table format
+            t : table format
+                Write as a PyTables Table structure which may perform worse but
+                allow more flexible operations like searching / selecting subsets
+                of the data
         append : boolean, default True, append the input data to the existing
         data_columns : list of columns to create as data columns, or True to use all columns
         min_itemsize : dict of columns that specify minimum string sizes
@@ -735,7 +761,7 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
             raise Exception(
                 "columns is not a supported keyword in append, try data_columns")

-        kwargs['table'] = True
+        kwargs = self._validate_format(fmt or 't', kwargs)
         self._write_to_group(key, value, append=append, **kwargs)

     def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
@@ -901,13 +927,39 @@ def _check_if_open(self):
         if not self.is_open:
             raise ClosedFileError("{0} file is not open!".format(self._path))

-    def _create_storer(self, group, value=None, table=False, append=False, **kwargs):
+    def _validate_format(self, fmt, kwargs):
+        """ validate / deprecate formats; return the new kwargs """
+        kwargs = kwargs.copy()
+
+        if 'format' in kwargs:
+            raise TypeError("please specify an object format with the 'fmt' keyword")
+
+        # table arg
+        table = kwargs.pop('table',None)
+
+        if table is not None:
+            warnings.warn(fmt_deprecate_doc,FutureWarning)
+
+            if table:
+                fmt = 't'
+            else:
+                fmt = 's'
+
+        # validate
+        try:
+            kwargs['fmt'] = _FORMAT_MAP[fmt.lower()]
+        except:
+            raise TypeError("invalid HDFStore format specified [{0}]".format(fmt))
+
+        return kwargs
+
+    def _create_storer(self, group, fmt=None, value=None, append=False, **kwargs):
         """ return a suitable Storer class to operate """

         def error(t):
             raise TypeError(
-                "cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" %
-                (t, group, type(value), table, append, kwargs))
+                "cannot properly create the storer for: [%s] [group->%s,value->%s,fmt->%s,append->%s,kwargs->%s]" %
+                (t, group, type(value), fmt, append, kwargs))

         pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
         tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))
@@ -931,7 +983,7 @@ def error(t):
                 error('_TYPE_MAP')

             # we are actually a table
-            if table or append:
+            if fmt == 't':
                 pt += u('_table')

         # a storer node
@@ -983,7 +1035,7 @@ def error(t):
             error('_TABLE_MAP')

     def _write_to_group(
-            self, key, value, index=True, table=False, append=False,
+            self, key, value, fmt, index=True, append=False,
             complib=None, encoding=None, **kwargs):
         group = self.get_node(key)

@@ -994,7 +1046,7 @@ def _write_to_group(
         # we don't want to store a table node at all if our object is 0-len
         # as there are no dtypes
-        if getattr(value,'empty',None) and (table or append):
+        if getattr(value,'empty',None) and (fmt == 't' or append):
             return

         if group is None:
@@ -1014,12 +1066,12 @@ def _write_to_group(
                 group = self._handle.createGroup(path, p)
                 path = new_path

-        s = self._create_storer(group, value, table=table, append=append,
+        s = self._create_storer(group, fmt, value, append=append,
                                 encoding=encoding, **kwargs)
         if append:
             # raise if we are trying to append to a non-table,
             # or a table that exists (and we are putting)
-            if not s.is_table or (s.is_table and table is None and s.is_exists):
+            if not s.is_table or (s.is_table and fmt == 's' and s.is_exists):
                 raise ValueError('Can only append to Tables')
             if not s.is_exists:
                 s.set_object_info()
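
The guard above turns an append against a storer-format node into an immediate error. Reproducing it from the user side (this mirrors the new tests below; file name is illustrative)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(10)})

    df.iloc[:5].to_hdf('test.h5', 'df')     # put -> storer format by default
    try:
        df.iloc[5:].to_hdf('test.h5', 'df', append=True, fmt='s')
    except ValueError:
        pass   # "Can only append to Tables"
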
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 0de50960190e9..e2d9235510f83 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -163,12 +163,33 @@ def test_api(self):
             df.iloc[10:].to_hdf(path,'df',append=True,table=True)
             assert_frame_equal(read_hdf(path,'df'),df)

+        with tm.ensure_clean(self.path) as path:
+
+            df = tm.makeDataFrame()
+            df.iloc[:10].to_hdf(path,'df',append=True)
+            df.iloc[10:].to_hdf(path,'df',append=True,table='t')
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+            # append=False
+            df.iloc[:10].to_hdf(path,'df',append=False,table='t')
+            df.iloc[10:].to_hdf(path,'df',append=True)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
         with tm.ensure_clean(self.path) as path:

             df = tm.makeDataFrame()
             df.to_hdf(path,'df',append=False,table=False)
             assert_frame_equal(read_hdf(path,'df'),df)

+            df.to_hdf(path,'df',append=False,fmt='s')
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+            df.to_hdf(path,'df',append=False)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+            df.to_hdf(path,'df')
+            assert_frame_equal(read_hdf(path,'df'),df)
+
         with ensure_clean(self.path) as store:

             df = tm.makeDataFrame()
@@ -181,6 +202,26 @@ def test_api(self):
             store.append('df',df.iloc[10:],append=True,table=True)
             assert_frame_equal(read_hdf(path,'df'),df)

+            # formats
+            store.append('df',df.iloc[:10],append=False,fmt='t')
+            store.append('df',df.iloc[10:],append=True,fmt='t')
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+            _maybe_remove(store,'df')
+            store.append('df',df.iloc[:10],append=False,fmt='t')
+            store.append('df',df.iloc[10:],append=True,fmt=None)
+            assert_frame_equal(read_hdf(path,'df'),df)
+
+        with tm.ensure_clean(self.path) as path:
+
+            # invalid
+            df = tm.makeDataFrame()
+            self.assertRaises(ValueError, df.to_hdf, path,'df',append=True,fmt='s')
+
+            self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,fmt='foo')
+            self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,fmt='bar')
+            self.assertRaises(TypeError, df.to_hdf, path,'df',format='s')
+
     def test_keys(self):

         with ensure_clean(self.path) as store:
@@ -1705,7 +1746,7 @@ def test_remove_where(self):
             # try to remove non-table (with crit)
             # non-table ok (where = None)
             wp = tm.makePanel()
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')
             store.remove('wp', [('minor_axis', ['A', 'D'])])
             rs = store.select('wp')
             expected = wp.reindex(minor_axis=['B', 'C'])
@@ -1713,7 +1754,7 @@ def test_remove_where(self):

             # empty where
             _maybe_remove(store, 'wp')
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')

             # deleted number (entire table)
             n = store.remove('wp', [])
@@ -1721,12 +1762,12 @@ def test_remove_where(self):

             # non - empty where
             _maybe_remove(store, 'wp')
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')
             self.assertRaises(ValueError, store.remove,
                               'wp', ['foo'])

             # selecting non-table with a where
-            # store.put('wp2', wp, table=False)
+            # store.put('wp2', wp, fmt='s')
             # self.assertRaises(ValueError, store.remove,
             #                   'wp2', [('column', ['A', 'D'])])

@@ -1739,7 +1780,7 @@ def test_remove_crit(self):
             # group row removal
             date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10])
             crit4 = Term('major_axis', date4)
-            store.put('wp3', wp, table=True)
+            store.put('wp3', wp, fmt='t')
             n = store.remove('wp3', where=[crit4])
             assert(n == 36)
             result = store.select('wp3')
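
As the tests here exercise, ``remove`` with a ``where`` clause deletes only the matching rows of a table and returns the row count. A sketch with made-up data (file and key are illustrative)::

    import numpy as np
    import pandas as pd

    store = pd.HDFStore('data.h5')
    df = pd.DataFrame({'A': np.arange(10)},
                      index=pd.date_range('20130101', periods=10))
    store.put('df', df, fmt='t')

    # drop the back half of the rows; n is the number of rows removed
    n = store.remove('df', where=[('index', '>', df.index[4])])
    assert n == 5
    store.close()
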
@@ -1747,7 +1788,7 @@ def test_remove_crit(self):
             assert_panel_equal(result, expected)

             # upper half
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')
             date = wp.major_axis[len(wp.major_axis) // 2]

             crit1 = Term('major_axis', '>', date)
@@ -1764,7 +1805,7 @@ def test_remove_crit(self):
             assert_panel_equal(result, expected)

             # individual row elements
-            store.put('wp2', wp, table=True)
+            store.put('wp2', wp, fmt='t')

             date1 = wp.major_axis[1:3]
             crit1 = Term('major_axis', date1)
@@ -1790,7 +1831,7 @@ def test_remove_crit(self):
             assert_panel_equal(result, expected)

             # corners
-            store.put('wp4', wp, table=True)
+            store.put('wp4', wp, fmt='t')
             n = store.remove(
                 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])])
             result = store.select('wp4')
@@ -1802,8 +1843,8 @@ def test_terms(self):

             wp = tm.makePanel()
             p4d = tm.makePanel4D()
-            store.put('wp', wp, table=True)
-            store.put('p4d', p4d, table=True)
+            store.put('wp', wp, fmt='t')
+            store.put('p4d', p4d, fmt='t')

             # some invalid terms
             terms = [
@@ -2158,8 +2199,8 @@ def test_wide_table(self):
     def test_wide_table_dups(self):
         wp = tm.makePanel()
         with ensure_clean(self.path) as store:
-            store.put('panel', wp, table=True)
-            store.put('panel', wp, table=True, append=True)
+            store.put('panel', wp, fmt='t')
+            store.put('panel', wp, fmt='t', append=True)

             with tm.assert_produces_warning(expected_warning=DuplicateWarning):
                 recons = store['panel']
@@ -2225,12 +2266,12 @@ def test_select(self):

             # put/select ok
             _maybe_remove(store, 'wp')
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')
             store.select('wp')

             # non-table ok (where = None)
             _maybe_remove(store, 'wp')
-            store.put('wp2', wp, table=False)
+            store.put('wp2', wp)
             store.select('wp2')

             # selection on the non-indexable with a large number of columns
@@ -2411,7 +2452,7 @@ def test_select_iterator(self):
         with tm.ensure_clean(self.path) as path:

             df = tm.makeTimeDataFrame(500)
-            df.to_hdf(path,'df',table=True)
+            df.to_hdf(path,'df',fmt='t')

             results = []
             for x in read_hdf(path,'df',chunksize=100):
@@ -2462,7 +2503,7 @@ def test_retain_index_attributes(self):

         with ensure_clean(self.path) as store:
             _maybe_remove(store,'data')
-            store.put('data', df, table=True)
+            store.put('data', df, fmt='t')

             result = store.get('data')
             tm.assert_frame_equal(df,result)
@@ -2520,7 +2561,7 @@ def test_panel_select(self):
         wp = tm.makePanel()

         with ensure_clean(self.path) as store:
-            store.put('wp', wp, table=True)
+            store.put('wp', wp, fmt='t')
             date = wp.major_axis[len(wp.major_axis) // 2]

             crit1 = ('major_axis', '>=', date)
@@ -2540,7 +2581,7 @@ def test_frame_select(self):
         df = tm.makeTimeDataFrame()

         with ensure_clean(self.path) as store:
-            store.put('frame', df, table=True)
+            store.put('frame', df,fmt='t')
             date = df.index[len(df) // 2]

             crit1 = ('index', '>=', date)
@@ -2848,7 +2889,7 @@ def test_select_filter_corner(self):
         df.columns = ['%.3d' % c for c in df.columns]

         with ensure_clean(self.path) as store:
-            store.put('frame', df, table=True)
+            store.put('frame', df, fmt='t')

             crit = Term('columns', df.columns[:75])
             result = store.select('frame', [crit])
@@ -2886,7 +2927,7 @@ def _check_roundtrip_table(self, obj, comparator, compression=False):
             options['complib'] = _default_compressor

         with ensure_clean(self.path, 'w', **options) as store:
-            store.put('obj', obj, table=True)
+            store.put('obj', obj, fmt='t')
             retrieved = store['obj']
             # sorted_obj = _test_sort(obj)
             comparator(retrieved, obj)
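
``test_select_iterator`` above depends on table format: ``chunksize`` (and ``iterator=True``) reads only work against ``fmt='t'`` data. A usage sketch (file name is illustrative)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(500)})
    df.to_hdf('data.h5', 'df', fmt='t')     # must be a table to iterate

    pieces = []
    for chunk in pd.read_hdf('data.h5', 'df', chunksize=100):
        pieces.append(chunk)

    result = pd.concat(pieces)
    assert len(result) == len(df)
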
@@ -2897,7 +2938,7 @@ def test_multiple_open_close(self):

         with tm.ensure_clean(self.path) as path:

             df = tm.makeDataFrame()
-            df.to_hdf(path,'df',mode='w',table=True)
+            df.to_hdf(path,'df',mode='w',fmt='t')

             # single
             store = HDFStore(path)
@@ -2959,7 +3000,7 @@ def test_multiple_open_close(self):

         with tm.ensure_clean(self.path) as path:

             df = tm.makeDataFrame()
-            df.to_hdf(path,'df',mode='w',table=True)
+            df.to_hdf(path,'df',mode='w',fmt='t')

             store = HDFStore(path)
             store.close()
@@ -3202,7 +3243,7 @@ def test_store_datetime_mixed(self):
     #                index=[np.arange(5).repeat(2),
     #                       np.tile(np.arange(2), 5)])

-    #     self.assertRaises(Exception, store.put, 'foo', df, table=True)
+    #     self.assertRaises(Exception, store.put, 'foo', df, fmt='t')


def _test_sort(obj):