diff --git a/RELEASE.rst b/RELEASE.rst index 7fe8b017cf71b..c5bb354c8bdc9 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -25,7 +25,7 @@ Where to get it pandas 0.8.0 ============ -**Release date:** NOT YET RELEASED +**Release date:** 6/26/2012 **New features** @@ -43,7 +43,7 @@ pandas 0.8.0 conversion method (#1018) - Implement robust frequency inference function and `inferred_freq` attribute on DatetimeIndex (#391) - - New ``tz_convert`` methods in Series / DataFrame + - New ``tz_convert`` and ``tz_localize`` methods in Series / DataFrame - Convert DatetimeIndexes to UTC if time zones are different in join/setops (#864) - Add limit argument for forward/backward filling to reindex, fillna, @@ -86,7 +86,10 @@ pandas 0.8.0 - Add lag plot (#1440) - Add autocorrelation_plot (#1425) - Add support for tox and Travis CI (#1382) - - Add support for ordered factors and use in GroupBy (#292) + - Add support for Categorical use in GroupBy (#292) + - Add ``any`` and ``all`` methods to DataFrame (#1416) + - Add ``secondary_y`` option to Series.plot + - Add experimental ``lreshape`` function for reshaping wide to long **Improvements to existing features** @@ -124,9 +127,20 @@ pandas 0.8.0 - Add ``convert_dtype`` option to Series.apply to be able to leave data as dtype=object (#1414) - Can specify all index level names in concat (#1419) + - Add ``dialect`` keyword to parsers for quoting conventions (#1363) + - Enable DataFrame[bool_DataFrame] += value (#1366) + - Add ``retries`` argument to ``get_data_yahoo`` to try to prevent Yahoo! API + 404s (#826) + - Improve performance of reshaping by using O(N) categorical sorting + - Series names will be used for index of DataFrame if no index passed (#1494) + - Header argument in DataFrame.to_csv can accept a list of column names to + use instead of the object's columns (#921) + - Add ``raise_conflict`` argument to DataFrame.update (#1526) **API Changes** + - Rename Factor to Categorical and add improvements. 
Numerous Categorical bug + fixes - Frequency name overhaul, WEEKDAY/EOM and rules with @ deprecated. get_legacy_offset_name backwards compatibility function added - Raise ValueError in DataFrame.__nonzero__, so "if df" no longer works @@ -190,6 +204,11 @@ pandas 0.8.0 - Fix outer/inner DataFrame.join with non-unique indexes (#1421) - Fix MultiIndex groupby bugs with empty lower levels (#1401) - Calling fillna with a Series will have same behavior as with dict (#1486) + - SparseSeries reduction bug (#1375) + - Fix unicode serialization issue in HDFStore (#1361) + - Pass keywords to pyplot.boxplot in DataFrame.boxplot (#1493) + - Bug fixes in MonthBegin (#1483) + - Preserve MultiIndex names in drop (#1513) pandas 0.7.3 ============ diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 57d6471a8fe7b..f4c0eae4cfca0 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -217,3 +217,27 @@ passed in the index, thus finding the integers ``0`` and ``1``. While it would be possible to insert some logic to check whether a passed sequence is all contained in the index, that logic would exact a very high cost in large data sets. + +Timestamp limitations +--------------------- + +Minimum and maximum timestamps +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since pandas represents timestamps in nanosecond resolution, the timespan that +can be represented using a 64-bit integer is limited to approximately 584 years: + +.. ipython:: python + + begin = Timestamp(-9223285636854775809L) + begin + end = Timestamp(np.iinfo(np.int64).max) + end + +If you need to represent time series data outside the nanosecond timespan, use +PeriodIndex: + +.. ipython:: python + + span = period_range('1215-01-01', '1381-01-01', freq='D') + span diff --git a/doc/source/io.rst b/doc/source/io.rst index 50a90f601c573..8c67b7d96d395 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -83,8 +83,10 @@ data into a DataFrame object. They can take a number of arguments: as the index. 
- ``names``: List of column names to use. If passed, header will be implicitly set to None. - - ``na_values``: optional list of strings to recognize as NaN (missing values), - in addition to a default set. + - ``na_values``: optional list of strings to recognize as NaN (missing + values), in addition to a default set. If you pass an empty list or an + empty list for a particular column, no values (including empty strings) + will be considered NA - ``parse_dates``: if True then index will be parsed as dates (False by default). You can specify more complicated options to parse a subset of columns or a combination of columns into a single date column diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index c616a07670e02..ac62bae5071cf 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -297,7 +297,7 @@ We could have done the same thing with ``DateOffset``: .. ipython:: python - from pandas.core.datetools import * + from pandas.tseries.offsets import * d + DateOffset(months=4, days=5) The key features of a ``DateOffset`` object are: diff --git a/doc/source/whatsnew/v0.4.x.txt b/doc/source/v0.4.x.txt similarity index 100% rename from doc/source/whatsnew/v0.4.x.txt rename to doc/source/v0.4.x.txt diff --git a/doc/source/whatsnew/v0.5.0.txt b/doc/source/v0.5.0.txt similarity index 100% rename from doc/source/whatsnew/v0.5.0.txt rename to doc/source/v0.5.0.txt diff --git a/doc/source/whatsnew/v0.6.0.txt b/doc/source/v0.6.0.txt similarity index 100% rename from doc/source/whatsnew/v0.6.0.txt rename to doc/source/v0.6.0.txt diff --git a/doc/source/whatsnew/v0.6.1.txt b/doc/source/v0.6.1.txt similarity index 100% rename from doc/source/whatsnew/v0.6.1.txt rename to doc/source/v0.6.1.txt diff --git a/doc/source/whatsnew/v0.7.0.txt b/doc/source/v0.7.0.txt similarity index 100% rename from doc/source/whatsnew/v0.7.0.txt rename to doc/source/v0.7.0.txt diff --git a/doc/source/whatsnew/v0.7.1.txt b/doc/source/v0.7.1.txt similarity index 100% 
rename from doc/source/whatsnew/v0.7.1.txt rename to doc/source/v0.7.1.txt diff --git a/doc/source/whatsnew/v0.7.2.txt b/doc/source/v0.7.2.txt similarity index 100% rename from doc/source/whatsnew/v0.7.2.txt rename to doc/source/v0.7.2.txt diff --git a/doc/source/whatsnew/v0.7.3.txt b/doc/source/v0.7.3.txt similarity index 100% rename from doc/source/whatsnew/v0.7.3.txt rename to doc/source/v0.7.3.txt diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/v0.8.0.txt similarity index 84% rename from doc/source/whatsnew/v0.8.0.txt rename to doc/source/v0.8.0.txt index 5ea0be0ddd971..c4049c208029e 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/v0.8.0.txt @@ -67,15 +67,16 @@ Time series changes and improvements PeriodIndex and DatetimeIndex - New Timestamp data type subclasses `datetime.datetime`, providing the same interface while enabling working with nanosecond-resolution data. Also - provides **easy time zone conversions** -- Enhanced support for **time zones**. Add `tz_convert` methods to TimeSeries - and DataFrame. All timestamps are stored as UTC; Timestamps from - DatetimeIndex objects with time zone set will be localized to localtime. Time - zone conversions are therefore essentially free. User needs to know very - little about pytz library now; only time zone names as as strings are - required. Timestamps are equal if and only if their UTC timestamps - match. Operations between time series with different time zones will result - in a UTC-indexed time series + provides :ref:`easy time zone conversions `. +- Enhanced support for :ref:`time zones `. Add + `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All + timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time + zone set will be localized to localtime. Time zone conversions are therefore + essentially free. User needs to know very little about pytz library now; only + time zone names as strings are required. 
Time zone-aware timestamps are + equal if and only if their UTC timestamps match. Operations between time + zone-aware time series with different time zones will result in a UTC-indexed + time series. - Time series **string indexing conveniences** / shortcuts: slice years, year and month, and index values with strings - Enhanced time series **plotting**; adaptation of scikits.timeseries @@ -111,8 +112,11 @@ index duplication in many-to-many joins) Other new features ~~~~~~~~~~~~~~~~~~ -- New :ref:`cut ` function (like R's cut function) for - computing a categorical variable from a continuous variable by binning values +- New :ref:`cut ` and ``qcut`` functions (like R's cut + function) for computing a categorical variable from a continuous variable by + binning values either into value-based (``cut``) or quantile-based (``qcut``) + bins +- Rename ``Factor`` to ``Categorical`` and add a number of usability features - Add :ref:`limit ` argument to fillna/reindex - More flexible multiple function application in GroupBy, and can pass list (name, function) tuples to get result in particular order with given names @@ -133,8 +137,8 @@ Other new features memory usage than Python's dict - Add first, last, min, max, and prod optimized GroupBy functions - New :ref:`ordered_merge ` function -- Add flexible :ref:`comparison ` instance methods eq, ne, lt, gt, etc. to DataFrame, - Series +- Add flexible :ref:`comparison ` instance methods eq, ne, lt, + gt, etc. to DataFrame, Series - Improve :ref:`scatter_matrix ` plotting function and add histogram or kernel density estimates to diagonal - Add :ref:`'kde' ` plot option for density plots @@ -146,6 +150,42 @@ Other new features - Can select multiple columns from GroupBy - Add :ref:`update ` methods to Series/DataFrame for updating values in place +- Add ``any`` and ``all`` methods to DataFrame + +New plotting methods +~~~~~~~~~~~~~~~~~~~~ + +.. 
ipython:: python + :suppress: + + import pandas as pd + fx = pd.load('data/fx_prices') + import matplotlib.pyplot as plt + +``Series.plot`` now supports a ``secondary_y`` option: + +.. ipython:: python + + plt.figure() + + fx['FR'].plot(style='g') + + @savefig whatsnew_secondary_y.png width=4.5in + fx['IT'].plot(style='k--', secondary_y=True) + +Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot +types. For example, ``'kde'`` is a new option: + +.. ipython:: python + + s = Series(np.concatenate((np.random.randn(1000), + np.random.randn(1000) * 0.5 + 3))) + plt.figure() + s.hist(normed=True, alpha=0.2) + @savefig whatsnew_kde.png width=4.5in + s.plot(kind='kde') + +See :ref:`the plotting page <visualization.other>` for much more. Other API changes ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 34bd803516468..eb15d7f77252d 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -91,6 +91,20 @@ You may pass ``logy`` to get a log-scale Y axis. @savefig series_plot_logy.png width=4.5in ts.plot(logy=True) +Plotting on a Secondary Y-axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To plot data on a secondary y-axis, use the ``secondary_y`` keyword: + +.. ipython:: python + + plt.figure() + + df.A.plot() + + @savefig series_plot_secondary_y.png width=4.5in + df.B.plot(secondary_y=True, style='g') + Targeting different subplots ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -107,6 +121,8 @@ You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: @savefig series_plot_multi.png width=4.5in df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') +.. _visualization.other: + Other plotting features ----------------------- diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index b930bdbbde1b1..e60baa37820c9 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,21 +16,21 @@ What's New These are new features and improvements of note in each release. -.. 
include:: whatsnew/v0.8.0.txt +.. include:: v0.8.0.txt -.. include:: whatsnew/v0.7.3.txt +.. include:: v0.7.3.txt -.. include:: whatsnew/v0.7.2.txt +.. include:: v0.7.2.txt -.. include:: whatsnew/v0.7.1.txt +.. include:: v0.7.1.txt -.. include:: whatsnew/v0.7.0.txt +.. include:: v0.7.0.txt -.. include:: whatsnew/v0.6.1.txt +.. include:: v0.6.1.txt -.. include:: whatsnew/v0.6.0.txt +.. include:: v0.6.0.txt -.. include:: whatsnew/v0.5.0.txt +.. include:: v0.5.0.txt -.. include:: whatsnew/v0.4.x.txt +.. include:: v0.4.x.txt diff --git a/pandas/core/api.py b/pandas/core/api.py index df164575e3b04..8cf3b7f4cbda4 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -15,7 +15,8 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel from pandas.core.groupby import groupby -from pandas.core.reshape import pivot_simple as pivot, get_dummies +from pandas.core.reshape import (pivot_simple as pivot, get_dummies, + lreshape) WidePanel = Panel diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d94e55c6d597..4db15c6c69101 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -914,3 +914,15 @@ def writerow(self, row): self.stream.write(data) # empty queue self.queue.truncate(0) + + +_NS_DTYPE = np.dtype('M8[ns]') + +def _concat_compat(to_concat): + if all(x.dtype == _NS_DTYPE for x in to_concat): + # work around NumPy 1.6 bug + new_values = np.concatenate([x.view(np.int64) for x in to_concat]) + return new_values.view(_NS_DTYPE) + else: + return np.concatenate(to_concat) + diff --git a/pandas/core/format.py b/pandas/core/format.py index 59ab7b473d99a..776a0fd80291f 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -594,19 +594,7 @@ def _format_datetime64(x, tz=None): return 'NaT' stamp = lib.Timestamp(x, tz=tz) - base = stamp.strftime('%Y-%m-%d %H:%M:%S') - - fraction = stamp.microsecond * 1000 + stamp.nanosecond - digits = 9 - - if fraction == 0: - return base - - while (fraction % 10) == 0: - fraction /= 10 
- digits -= 1 - - return base + ('.%%.%id' % digits) % fraction + return stamp._repr_base def _make_fixed_width(strings, justify='right'): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b071bca8cf5de..9a3c47302707f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -166,6 +166,9 @@ merged : DataFrame """ +# Custom error class for update + +class DataConflictError(Exception): pass #---------------------------------------------------------------------- # Factory helper methods @@ -389,6 +392,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=copy) elif isinstance(data, list): if len(data) > 0: + if index is None and isinstance(data[0], Series): + index = _get_names_from_index(data) + if isinstance(data[0], (list, tuple, dict, Series)): conv_data, columns = _to_sdict(data, columns) if isinstance(conv_data, dict): @@ -615,12 +621,16 @@ def iterrows(self): s.name = k yield k, s - def itertuples(self): + def itertuples(self, index=True): """ Iterate over rows of DataFrame as tuples, with index value as first element of the tuple """ - return izip(self.index, *self.values.T) + arrays = [] + if index: + arrays.append(self.index) + arrays.extend(self[k] for k in self.columns) + return izip(*arrays) iterkv = iteritems if py3compat.PY3: # pragma: no cover @@ -1017,15 +1027,17 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, - index=True, index_label=None): + def _helper_csvexcel(self, writer, na_rep=None, cols=None, + header=True, index=True, index_label=None): if cols is None: cols = self.columns series = {} for k, v in self._series.iteritems(): series[k] = v.values - if header: + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: if index: # should write something for index label if index_label is None: @@ -1046,7 +1058,15 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, 
header=True, index_label = [index_label] encoded_labels = list(index_label) - encoded_cols = list(cols) + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) writer.writerow(encoded_labels + encoded_cols) else: @@ -1071,8 +1091,8 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - header=True, index=True, index_label=None, mode='w', - nanRep=None, encoding=None): + header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1084,8 +1104,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1121,6 +1142,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, header=header, index=index, index_label=index_label) + finally: if close: f.close() @@ -1140,8 +1162,9 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', Missing data rep'n cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. 
If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -3148,7 +3171,8 @@ def combine_first(self, other): combiner = lambda x, y: np.where(isnull(x), y, x) return self.combine(other, combiner) - def update(self, other, join='left', overwrite=True, filter_func=None): + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): """ Modify DataFrame in place using non-NA values from passed DataFrame. Aligns on indices @@ -3162,6 +3186,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None): filter_func : callable(1d-array) -> 1d-array, default None Can choose to replace values other than NA. Return True for values that should be updated + raise_conflict : bool + If True, will raise an error if the DataFrame and other both + contain data in the same place. """ if join != 'left': raise NotImplementedError @@ -3173,6 +3200,12 @@ def update(self, other, join='left', overwrite=True, filter_func=None): if filter_func is not None: mask = -filter_func(this) | isnull(that) else: + if raise_conflict: + mask_this = notnull(that) + mask_that = notnull(this) + if any(mask_this & mask_that): + raise DataConflictError("Data overlaps.") + if overwrite: mask = isnull(that) else: @@ -4222,9 +4255,9 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, if result.dtype == np.object_: try: if filter_type is None or filter_type == 'numeric': - result = result.astype('f8') - elif filter_type == 'bool': - result = result.astype('b') + result = result.astype(np.float64) + elif filter_type == 'bool' and notnull(result).all(): + result = result.astype(np.bool_) else: raise ValueError('Invalid dtype %s ' % str(filter_type)) @@ -4604,7 +4637,7 @@ def extract_index(data): elif isinstance(v, dict): have_dicts = True indexes.append(v.keys()) - elif isinstance(v, (list, np.ndarray)): + elif isinstance(v, 
(list, tuple, np.ndarray)): have_raw_arrays = True raw_lengths.append(len(v)) @@ -4754,6 +4787,22 @@ def _convert_object_array(content, columns, coerce_float=False): for c, vals in zip(columns, content)) return sdict, columns +def _get_names_from_index(data): + index = range(len(data)) + has_some_name = any([s.name is not None for s in data]) + if not has_some_name: + return index + + count = 0 + for i, s in enumerate(data): + n = s.name + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed %d' % count + count += 1 + + return index def _homogenize(data, index, columns, dtype=None): from pandas.core.series import _sanitize_array diff --git a/pandas/core/index.py b/pandas/core/index.py index db0bd1bd7147e..35096651b90ef 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -405,7 +405,7 @@ def asof(self, label): def asof_locs(self, where, mask): """ where : array of timestamps - mask : array of booleans where data is NA + mask : array of booleans where data is not NA """ locs = self.values[mask].searchsorted(where.values, side='right') @@ -2295,7 +2295,8 @@ def delete(self, loc): new_index : MultiIndex """ new_labels = [np.delete(lab, loc) for lab in self.labels] - return MultiIndex(levels=self.levels, labels=new_labels) + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names) get_major_bounds = slice_locs diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83278e3c994d0..1966b51f5b2e7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1389,7 +1389,7 @@ def _consolidate(blocks, items): def _merge_blocks(blocks, items): if len(blocks) == 1: return blocks[0] - new_values = np.vstack([b.values for b in blocks]) + new_values = _vstack([b.values for b in blocks]) new_items = blocks[0].items.append([b.items for b in blocks[1:]]) new_block = make_block(new_values, new_items, items, do_integrity_check=True) @@ -1422,3 +1422,11 @@ def _union_items_slow(all_items): else: seen = 
seen.union(items) return seen + +def _vstack(to_stack): + if all(x.dtype == _NS_DTYPE for x in to_stack): + # work around NumPy 1.6 bug + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(_NS_DTYPE) + else: + return np.vstack(to_stack) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 08d45ddf63bd5..475a6822edfea 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -12,6 +12,7 @@ from pandas.core.common import notnull, _ensure_platform_int from pandas.core.groupby import (get_group_index, _compress_group_index, decons_group_index) +import pandas.core.common as com import pandas.lib as lib @@ -559,6 +560,72 @@ def melt(frame, id_vars=None, value_vars=None): mdata['variable'] = np.asarray(frame.columns).repeat(N) return DataFrame(mdata, columns=mcolumns) + +def lreshape(data, groups, dropna=True, label=None): + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : DataFrame + groups : dict + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], + 'hr': ['hr1', 'hr2']}) + team hr year + 0 Red Sox 514 2007 + 1 Yankees 573 2007 + 2 Red Sox 545 2008 + 3 Yankees 526 2008 + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = groups.keys() + values = groups.values() + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.diff(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError('All column lists must be same length') + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + mdata[target] = com._concat_compat([data[col].values for col in names]) + pivot_cols.append(target) + + for col in id_cols: + 
mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notnull(mdata[c]) + if not mask.all(): + mdata = dict((k, v[mask]) for k, v in mdata.iteritems()) + + return DataFrame(mdata, columns=id_cols + pivot_cols) + + def convert_dummies(data, cat_variables, prefix_sep='_'): """ Compute DataFrame with specified columns converted to dummy variables (0 / diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8261b889b4500..fe07c37a49f36 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -662,7 +662,7 @@ def _get_index_name(self, columns=None): index_name = None elif np.isscalar(self.index_col): if isinstance(self.index_col, basestring): - index_names = self.index_col + index_name = self.index_col for i, c in enumerate(list(columns)): if c == self.index_col: self.index_col = i @@ -1029,9 +1029,9 @@ def _convert_types(values, na_values): return values, na_count try: - result = lib.maybe_convert_numeric(values, na_values) + result = lib.maybe_convert_numeric(values, na_values, False) except Exception: - na_count = lib.sanitize_objects(values, na_values) + na_count = lib.sanitize_objects(values, na_values, False) result = values if result.dtype == np.object_: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 52af5eff72336..4ec176556eaa8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -716,7 +716,7 @@ def _write_array(self, group, key, value): vlarr.append(value) elif value.dtype.type == np.datetime64: self.handle.createArray(group, key, value.view('i8')) - group._v_attrs.value_type = 'datetime64' + getattr(group, key)._v_attrs.value_type = 'datetime64' else: self.handle.createArray(group, key, value) @@ -958,7 +958,7 @@ def _read_array(group, key): if isinstance(node, tables.VLArray): return data[0] else: - dtype = getattr(group._v_attrs, 'value_type', None) + dtype = getattr(node._v_attrs, 'value_type', None) if dtype == 
'datetime64': return np.array(data, dtype='M8[ns]') return data diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7a75e687a4a8a..84a3b165a0af8 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -52,6 +52,32 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = read_csv(StringIO(data)) + xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_read_csv(self): pass @@ -308,6 +334,23 @@ def test_index_col_named(self): self.assertRaises(ValueError, read_csv, StringIO(no_header), index_col='ID') + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a' : [1, 5, 9], 'b' : [2, 6, 10], 'c' : [3, 7, 11], + 'd' : [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = read_csv(StringIO(data), names=names, index_col=['message']) + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + + rs = read_csv(StringIO(data), names=names, index_col='message') + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + def test_multiple_skts_example(self): data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." 
pass diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 8576b3d69ad99..87a6329ef9368 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -660,6 +660,11 @@ def test_unicode_index(self): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) + def test_store_datetime_mixed(self): + df = DataFrame({'a': [1,2,3], 'b': [1.,2.,3.], 'c': ['a', 'b', 'c']}) + ts = tm.makeTimeSeries() + df['d'] = ts.index[:3] + self._check_roundtrip(df, tm.assert_frame_equal) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx index ec50275ccc4a6..f7e325c4ae718 100644 --- a/pandas/src/datetime.pyx +++ b/pandas/src/datetime.pyx @@ -65,6 +65,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): return result + # Python front end to C extension type _Timestamp # This serves as the box for datetime64 class Timestamp(_Timestamp): @@ -101,10 +102,33 @@ class Timestamp(_Timestamp): return ts_base def __repr__(self): - result = self.strftime('' + result = self._repr_base + + try: + result += self.strftime('%z') + if self.tzinfo: + result += self.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + except ValueError: + year2000 = self.replace(year=2000) + result += year2000.strftime('%z') + if self.tzinfo: + result += year2000.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + + return '' % result + + @property + def _repr_base(self): + result = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (self.year, self.month, + self.day, self.hour, + self.minute, self.second) + + if self.nanosecond != 0: + nanos = self.nanosecond + 1000 * self.microsecond + result += '.%.9d' % nanos + elif self.microsecond != 0: + result += '.%.6d' % self.microsecond + + return result @property def tz(self): @@ -507,6 +531,7 @@ cpdef convert_to_tsobject(object ts, object tz=None): obj.tzinfo = ts.tzinfo if obj.tzinfo is not None: obj.value -= 
_delta_to_nanoseconds(obj.tzinfo._utcoffset) + _check_dts_bounds(obj.value, &obj.dts) return obj elif PyDate_Check(ts): obj.value = _date_to_datetime64(ts, &obj.dts) @@ -514,6 +539,9 @@ cpdef convert_to_tsobject(object ts, object tz=None): raise ValueError("Could not construct Timestamp from argument %s" % type(ts)) + if obj.value != NPY_NAT: + _check_dts_bounds(obj.value, &obj.dts) + if tz is not None: if tz is pytz.utc: obj.tzinfo = tz @@ -530,6 +558,20 @@ cpdef convert_to_tsobject(object ts, object tz=None): return obj +cdef int64_t _NS_LOWER_BOUND = -9223285636854775809LL +cdef int64_t _NS_UPPER_BOUND = -9223372036854775807LL + +cdef inline _check_dts_bounds(int64_t value, pandas_datetimestruct *dts): + cdef pandas_datetimestruct dts2 + if dts.year <= 1677 or dts.year >= 2262: + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts2) + if dts2.year != dts.year: + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + + raise ValueError('Out of bounds nanosecond timestamp: %s' % fmt) + # elif isinstance(ts, _Timestamp): # tmp = ts # obj.value = (<_Timestamp> ts).value @@ -613,8 +655,10 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False): iresult[i] = iNaT elif PyDateTime_Check(val): iresult[i] = _pydatetime_to_dts(val, &dts) + _check_dts_bounds(iresult[i], &dts) elif PyDate_Check(val): iresult[i] = _date_to_datetime64(val, &dts) + _check_dts_bounds(iresult[i], &dts) elif util.is_datetime64_object(val): iresult[i] = _get_datetime64_nanos(val) elif util.is_integer_object(val): @@ -627,6 +671,9 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False): result[i] = parse(val, dayfirst=dayfirst) except Exception: raise TypeError + pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns, + &dts) + _check_dts_bounds(iresult[i], &dts) return result except TypeError: oresult = np.empty(n, dtype=object) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx 
index 4057d240fceb6..451b998b0481b 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -241,7 +241,8 @@ def is_date_array(ndarray[object] values): return True -def maybe_convert_numeric(ndarray[object] values, set na_values): +def maybe_convert_numeric(ndarray[object] values, set na_values, + convert_empty=True): ''' Type inference function-- convert strings to numeric (potentially) and convert to proper dtype array @@ -275,8 +276,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = complexes[i] = nan seen_float = 1 elif len(val) == 0: - floats[i] = complexes[i] = nan - seen_float = 1 + if convert_empty: + floats[i] = complexes[i] = nan + seen_float = 1 + else: + raise ValueError('Empty string encountered') elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -573,7 +577,8 @@ def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, return result -def sanitize_objects(ndarray[object] values, set na_values): +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): cdef: Py_ssize_t i, n object val, onan @@ -585,7 +590,7 @@ def sanitize_objects(ndarray[object] values, set na_values): for i from 0 <= i < n: val = values[i] - if val == '' or val in na_values: + if (convert_empty and val == '') or (val in na_values): values[i] = onan na_count += 1 elif val in memo: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6c3cc89f18f1e..e3794222d0481 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -13,6 +13,7 @@ from numpy.random import randn import numpy as np import numpy.ma as ma +from numpy.testing import assert_array_equal import pandas as pan import pandas.core.nanops as nanops @@ -21,7 +22,7 @@ import pandas.core.datetools as datetools from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, MultiIndex, DatetimeIndex) -from pandas.io.parsers import (ExcelFile, ExcelWriter) +from 
pandas.io.parsers import (ExcelFile, ExcelWriter, read_csv) from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -1579,6 +1580,14 @@ def test_constructor_dict_dont_upcast(self): dm = DataFrame([[1,2],['a','b']], index=[1,2], columns=[1,2]) self.assert_(isinstance(dm[1][1], int)) + def test_constructor_dict_of_tuples(self): + # GH #1491 + data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + + result = DataFrame(data) + expected = DataFrame(dict((k, list(v)) for k, v in data.iteritems())) + assert_frame_equal(result, expected) + def test_constructor_ndarray(self): mat = np.zeros((2, 3), dtype=float) @@ -1863,6 +1872,28 @@ def test_constructor_list_of_dicts(self): assert_frame_equal(result, expected) def test_constructor_list_of_series(self): + data = [{'a': 1.5, 'b': 3.0, 'c':4.0}, + {'a': 1.5, 'b': 3.0, 'c':6.0}] + sdict = dict(zip(['x', 'y'], data)) + idx = Index(['a', 'b', 'c']) + + # all named + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx, name='y')] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + # some unnamed + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + + sdict = dict(zip(['x', 'Unnamed 0'], data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result.sort_index(), expected) + + # none named data = [{'a': 1.5, 'b': 3, 'c':4, 'd':6}, {'a': 1.5, 'b': 3, 'd':6}, {'a': 1.5, 'd':6}, @@ -2536,6 +2567,12 @@ def test_itertuples(self): expected = self.frame.ix[i,:].reset_index(drop=True) assert_series_equal(s, expected) + df = DataFrame({'floats': np.random.randn(5), + 'ints': range(5)}, columns=['floats', 'ints']) + + for tup in df.itertuples(index=False): + self.assert_(isinstance(tup[1], np.integer)) + def test_len(self): self.assertEqual(len(self.frame), len(self.frame.index)) @@ -3064,6 +3101,19 @@ def test_to_csv_from_csv(self): result 
= DataFrame.from_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_csv(path, header=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + os.remove(path) def test_to_csv_multiindex(self): @@ -3248,6 +3298,14 @@ def test_to_excel_from_excel(self): np.testing.assert_equal('test1', reader.sheet_names[0]) np.testing.assert_equal('test2', reader.sheet_names[1]) + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + assert_frame_equal(xp, rs) os.remove(path) @@ -5290,6 +5348,17 @@ def test_update_filtered(self): [1.5, nan, 7.]]) assert_frame_equal(df, expected) + def test_update_raise(self): + df = DataFrame([[1.5, 1, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[2., nan], + [nan, 7]], index=[1, 3], columns=[1,2]) + + np.testing.assert_raises(Exception, df.update, *(other,), + **{'raise_conflict' : True}) def test_combineAdd(self): # trivial @@ -6325,6 +6394,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) + def test_consolidate_datetime64(self): + # numpy vstack bug + + data = """\ +starting,ending,measure +2012-06-21 00:00,2012-06-23 07:00,77 +2012-06-23 07:00,2012-06-23 16:30,65 +2012-06-23 16:30,2012-06-25 08:00,77 +2012-06-25 08:00,2012-06-26 12:00,0 +2012-06-26 12:00,2012-06-27 08:00,77 +""" + df = read_csv(StringIO(data), parse_dates=[0,1]) + + ser_starting = df.starting + ser_starting.index = ser_starting.values + ser_starting = 
ser_starting.tz_localize('US/Eastern') + ser_starting = ser_starting.tz_convert('UTC') + + ser_ending = df.ending + ser_ending.index = ser_ending.values + ser_ending = ser_ending.tz_localize('US/Eastern') + ser_ending = ser_ending.tz_convert('UTC') + + df.starting = ser_starting.index + df.ending = ser_ending.index + + assert_array_equal(df.starting.values, ser_starting.index.values) + assert_array_equal(df.ending.values, ser_ending.index.values) + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 65fc503f2eb7f..2c386076b69b7 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -194,6 +194,9 @@ def test_boxplot(self): _check_plot_works(df.boxplot, notch=1) _check_plot_works(df.boxplot, by='indic', notch=1) + df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + _check_plot_works(df.boxplot, by='X') @slow def test_kde(self): @@ -223,6 +226,9 @@ def test_hist(self): df = DataFrame(np.random.randn(100, 6)) _check_plot_works(df.hist) + #make sure sharex, sharey is handled + _check_plot_works(df.hist, sharex=True, sharey=True) + #make sure kwargs are handled ser = df[0] xf, yf = 20, 20 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e6698ff52afaf..1189df2ac3ce6 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1355,6 +1355,16 @@ def test_drop_level(self): expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T assert_frame_equal(result, expected) + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]], + names=['one', 'two']) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + self.assert_(result.index.names == ['one', 'two']) + def test_unicode_repr_issues(self): levels 
= [Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3']), Index([0, 1])] diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 036ee8677b41c..3cb600ff69513 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -1,8 +1,20 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +from StringIO import StringIO +import cPickle as pickle +import operator +import os +import unittest + +import nose + from pandas import DataFrame +from numpy import nan import numpy as np -from pandas.core.reshape import melt, convert_dummies +from pandas.core.reshape import melt, convert_dummies, lreshape import pandas.util.testing as tm def test_melt(): @@ -38,8 +50,79 @@ def test_convert_dummies(): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected2) +class Test_lreshape(unittest.TestCase): + + def test_pairs(self): + data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009'], + 'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], + 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt1': [1823, 3338, 1549, 3298, 4306], + 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], + 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + + df = DataFrame(data) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + result = lreshape(df, spec) + + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', '08jan2009', + '30dec2008', '21dec2008', '11jan2009', + '08jan2009', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, + 1454, 3139, 4133, 1766, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, + 103, 
104, 105, 101, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Male', + 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', + '20jan2009', '21jan2009', '22jan2009', '31dec2008', + '03feb2009', '05feb2009', '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105, + 101, 102, 103, 104, 105, + 101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', + '21jan2009', nan, '22jan2009', + '31dec2008', '03feb2009', + '05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, + 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + self.assertRaises(ValueError, lreshape, df, spec) + + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) - diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index adf5abdebe0df..e5ba885c8e28e 100644 --- a/pandas/tools/plotting.py +++ 
b/pandas/tools/plotting.py @@ -1,18 +1,20 @@ # being a bit too dynamic # pylint: disable=E1101 from itertools import izip +import datetime import numpy as np from pandas.util.decorators import cache_readonly import pandas.core.common as com from pandas.core.index import Index, MultiIndex -from pandas.core.series import Series +from pandas.core.series import Series, remove_na from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex -from pandas.tseries.frequencies import get_period_alias +from pandas.tseries.frequencies import get_period_alias, get_base_alias from pandas.tseries.offsets import DateOffset import pandas.tseries.tools as datetools +import pandas.lib as lib def _get_standard_kind(kind): return {'density' : 'kde'}.get(kind, kind) @@ -50,6 +52,8 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, mask = com.notnull(df) + marker = _get_marker_compat(marker) + for i, a in zip(range(n), df.columns): for j, b in zip(range(n), df.columns): if i == j: @@ -130,6 +134,12 @@ def _gcf(): import matplotlib.pyplot as plt return plt.gcf() +def _get_marker_compat(marker): + import matplotlib.lines as mlines + if marker not in mlines.lineMarkers: + return 'o' + return marker + def andrews_curves(data, class_column, ax=None, samples=200): """ Parameters: @@ -451,9 +461,12 @@ def _adorn_subplots(self): self.ax.set_title(self.title) if self._need_to_set_index: - xticklabels = [_stringify(key) for key in self.data.index] + labels = [_stringify(key) for key in self.data.index] + labels = dict(zip(range(len(self.data.index)), labels)) + for ax_ in self.axes: # ax_.set_xticks(self.xticks) + xticklabels = [labels.get(x, '') for x in ax_.get_xticks()] ax_.set_xticklabels(xticklabels, rotation=self.rot) @property @@ -477,13 +490,16 @@ def plt(self): _need_to_set_index = False - def _get_xticks(self): + def _get_xticks(self, convert_period=False): index = self.data.index is_datetype = index.inferred_type in ('datetime', 
'date', 'datetime64') if self.use_index: - if index.is_numeric() or is_datetype: + if convert_period and isinstance(index, PeriodIndex): + index = index.to_timestamp() + x = index._mpl_repr() + elif index.is_numeric() or is_datetype: """ Matplotlib supports numeric values or datetime objects as xaxis values. Taking LBYL approach here, by the time @@ -562,49 +578,80 @@ def _post_plot_logic(self): if self.subplots and self.legend: self.axes[0].legend(loc='best') -class DatetimeConverter(object): - - @classmethod - def convert(cls, values, units, axis): - def try_parse(values): - try: - return datetools.to_datetime(values).toordinal() - except Exception: +try: # matplotlib is optional dependency + import matplotlib.units as units + import matplotlib.dates as dates + + class DatetimeConverter(dates.DateConverter): + + @staticmethod + def convert(values, unit, axis): + def try_parse(values): + try: + return datetools.to_datetime(values).toordinal() + except Exception: + return values + + if isinstance(values, (datetime.datetime, datetime.date)): + return values.toordinal() + elif isinstance(values, (datetime.time)): + return dates.date2num(values) + elif (com.is_integer(values) or com.is_float(values)): return values - - if (com.is_integer(values) or - com.is_float(values)): + elif isinstance(values, str): + return try_parse(values) + elif isinstance(values, Index): + return values.map(try_parse) + elif isinstance(values, (list, tuple, np.ndarray)): + return [try_parse(x) for x in values] return values - elif isinstance(values, str): - return try_parse(values) - elif isinstance(values, Index): - return values.map(try_parse) - return map(try_parse, values) + + units.registry[lib.Timestamp] = DatetimeConverter() + units.registry[datetime.date] = DatetimeConverter() + units.registry[datetime.datetime] = DatetimeConverter() +except ImportError: + pass class LinePlot(MPLPlot): def __init__(self, data, **kwargs): MPLPlot.__init__(self, data, **kwargs) - @property - def 
has_ts_index(self): + def _index_freq(self): from pandas.core.frame import DataFrame if isinstance(self.data, (Series, DataFrame)): - if isinstance(self.data.index, (DatetimeIndex, PeriodIndex)): - has_freq = (hasattr(self.data.index, 'freq') and - self.data.index.freq is not None) - has_inferred = (hasattr(self.data.index, 'inferred_freq') and - self.data.index.inferred_freq is not None) - return has_freq or has_inferred - return False + freq = (getattr(self.data.index, 'freq', None) + or getattr(self.data.index, 'inferred_freq', None)) + return freq + + def _is_dynamic_freq(self, freq): + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + freq = get_period_alias(freq) + return freq is not None + + def _use_dynamic_x(self): + freq = self._index_freq() + + ax, _ = self._get_ax_and_style(0) + ax_freq = getattr(ax, 'freq', None) + if freq is None: # convert irregular if axes has freq info + freq = ax_freq + else: # do not use tsplot if irregular was plotted first + if (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + return (freq is not None) and self._is_dynamic_freq(freq) def _make_plot(self): # this is slightly deceptive - if self.use_index and self.has_ts_index: + if self.use_index and self._use_dynamic_x(): data = self._maybe_convert_index(self.data) self._make_ts_plot(data) else: - x = self._get_xticks() + x = self._get_xticks(convert_period=True) plotf = self._get_plot_function() @@ -619,12 +666,9 @@ def _make_plot(self): if mask.any(): y = np.ma.array(y) y = np.ma.masked_where(mask, y) + plotf(ax, x, y, style, label=label, **self.kwds) ax.grid(self.grid) - idx = getattr(self.data, 'index', None) - if isinstance(idx, DatetimeIndex) or (idx is not None and - idx.inferred_type == 'datetime'): - ax.get_xaxis().converter = DatetimeConverter def _maybe_convert_index(self, data): # tsplot converts automatically, but don't want to convert index @@ -632,16 +676,23 @@ def _maybe_convert_index(self, data): 
from pandas.core.frame import DataFrame if (isinstance(data.index, DatetimeIndex) and isinstance(data, DataFrame)): - freq = getattr(data.index, 'freqstr', None) - - freq = get_period_alias(freq) + freq = getattr(data.index, 'freq', None) - if freq is None and hasattr(data.index, 'inferred_freq'): - freq = data.index.inferred_freq + if freq is None: + freq = getattr(data.index, 'inferred_freq', None) if isinstance(freq, DateOffset): freq = freq.rule_code + freq = get_period_alias(freq) + + if freq is None: + ax, _ = self._get_ax_and_style(0) + freq = getattr(ax, 'freq', None) + + if freq is None: + raise ValueError('Could not get frequency alias for plotting') + data = DataFrame(data.values, index=data.index.to_period(freq=freq), columns=data.columns) @@ -679,7 +730,7 @@ def _post_plot_logic(self): else: self.axes[0].legend(loc='best') - condition = (not self.has_ts_index + condition = (not self._use_dynamic_x and df.index.is_all_dates and not self.subplots or (self.subplots and self.sharex)) @@ -994,6 +1045,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, def plot_group(grouped, ax): keys, values = zip(*grouped) keys = [_stringify(x) for x in keys] + values = [remove_na(v) for v in values] ax.boxplot(values, **kwds) if kwds.get('vert', 1): ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) @@ -1030,7 +1082,8 @@ def plot_group(grouped, ax): # Return boxplot dict in single plot case - bp = ax.boxplot(list(data[cols].values.T), **kwds) + clean_values = [remove_na(x) for x in data[cols].values.T] + bp = ax.boxplot(clean_values, **kwds) if kwds.get('vert', 1): ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) else: @@ -1095,7 +1148,8 @@ def plot_group(group, ax): def hist_frame(data, grid=True, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None, ax=None, **kwds): + ylabelsize=None, yrot=None, ax=None, + sharex=False, sharey=False, **kwds): """ Draw Histogram the DataFrame's series using matplotlib / pylab. 
@@ -1112,6 +1166,8 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, yrot : float, default None rotation of y axis labels ax : matplotlib axes object, default None + sharex : bool, if True, the X axis will be shared amongst all subplots. + sharey : bool, if True, the Y axis will be shared amongst all subplots. kwds : other plotting keyword arguments To be passed to hist function """ @@ -1123,7 +1179,8 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, rows += 1 else: cols += 1 - _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False) + _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey) for i, col in enumerate(com._try_sort(data.columns)): ax = axes[i / cols][i % cols] @@ -1312,7 +1369,7 @@ def _subplots(nrows=1, ncols=1, sharex=False, sharey=False, squeeze=True, sharex : bool If True, the X axis will be shared amongst all subplots. - sharex : bool + sharey : bool If True, the Y axis will be shared amongst all subplots. 
squeeze : bool diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8b87f8ccbf820..98f9fbe3c5b6c 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -246,20 +246,37 @@ def _get_freq_str(base, mult=1): 'BA' : 'A', 'AS' : 'A', 'BAS' : 'A', - 'MS' : 'M' + 'MS' : 'M', + 'D' : 'D', + 'B' : 'B', + 'T' : 'T', + 'S' : 'S', + 'H' : 'H', + 'Q' : 'Q', + 'A' : 'A', + 'W' : 'W', + 'M' : 'M' } need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] -months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', - 'OCT', 'NOV', 'DEC'] -for prefix in need_suffix: - for m in months: - _offset_to_period_map['%s-%s' % (prefix, m)] = \ - _offset_to_period_map[prefix] +_months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] +for __prefix in need_suffix: + for _m in _months: + _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ + _offset_to_period_map[__prefix] +for __prefix in ['A', 'Q']: + for _m in _months: + _alias = '%s-%s' % (__prefix, _m) + _offset_to_period_map[_alias] = _alias + +_days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] +for _d in _days: + _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" - return _offset_to_period_map.get(offset_str, offset_str) + return _offset_to_period_map.get(offset_str, None) _rule_aliases = { # Legacy rules that will continue to map to their original values @@ -302,11 +319,11 @@ def get_period_alias(offset_str): 'us': 'U' } -for i, weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): - for iweek in xrange(4): - name = 'WOM-%d%s' % (iweek + 1, weekday) - _offset_map[name] = offsets.WeekOfMonth(week=iweek, weekday=i) - _rule_aliases[name.replace('-', '@')] = name +for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): + for _iweek in xrange(4): + _name = 'WOM-%d%s' % (_iweek + 1, _weekday) + _offset_map[_name] = 
offsets.WeekOfMonth(week=_iweek, weekday=_i) + _rule_aliases[_name.replace('-', '@')] = _name _legacy_reverse_map = dict((v, k) for k, v in _rule_aliases.iteritems()) @@ -542,8 +559,8 @@ def get_standard_freq(freq): } _reverse_period_code_map = {} -for k, v in _period_code_map.iteritems(): - _reverse_period_code_map[v] = k +for _k, _v in _period_code_map.iteritems(): + _reverse_period_code_map[_v] = _k # Additional aliases _period_code_map.update({ diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 1c4af41ff90e3..ae212902fb494 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -484,7 +484,7 @@ def append(self, other): to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] - return Index(_concat(to_concat), name=name) + return Index(com._concat_compat(to_concat), name=name) def get_duplicates(self): values = Index.get_duplicates(self) @@ -775,7 +775,7 @@ def _fast_union(self, other): if left_end < right_end: loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] - dates = _concat((left.values, right_chunk)) + dates = com._concat_compat((left.values, right_chunk)) return self._view_like(dates) else: return left @@ -1371,11 +1371,3 @@ def _time_to_micros(time): return 1000000 * seconds + time.microsecond -def _concat(to_concat): - if all(x.dtype == _NS_DTYPE for x in to_concat): - # work around NumPy 1.6 bug - new_values = np.concatenate([x.view('i8') for x in to_concat]) - return new_values.view(_NS_DTYPE) - else: - return np.concatenate(to_concat) - diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index b03d8febbfbc2..f35a85970136a 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -221,10 +221,11 @@ def freqstr(self): return repr(self) if self.n != 1: - return '%d%s' % (self.n, code) + fstr = '%d%s' % (self.n, code) else: - return code + fstr = code + return fstr class BusinessDay(CacheableOffset, DateOffset): """ @@ -261,6 +262,54 @@ def 
__repr__(self): out += '>' return out + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + if self.offset: + fstr += self._offset_str() + + return fstr + + def _offset_str(self): + def get_str(td): + off_str = '' + if td.days > 0: + off_str += str(td.days) + 'D' + if td.seconds > 0: + s = td.seconds + hrs = int(s / 3600) + if hrs != 0: + off_str += str(hrs) + 'H' + s -= hrs * 3600 + mts = int(s / 60) + if mts != 0: + off_str += str(mts) + 'Min' + s -= mts * 60 + if s != 0: + off_str += str(s) + 's' + if td.microseconds > 0: + off_str += str(td.microseconds) + 'us' + return off_str + + if isinstance(self.offset, timedelta): + zero = timedelta(0, 0, 0) + if self.offset >= zero: + off_str = '+' + get_str(self.offset) + else: + off_str = '-' + get_str(-self.offset) + return off_str + else: + return '+' + repr(self.offset) + def isAnchored(self): return (self.n == 1) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 22e571acef416..0a8c54e1c257f 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,5 @@ # pylint: disable=E1101,E1103,W0232 - -from datetime import datetime +from datetime import datetime, date import numpy as np from pandas.tseries.frequencies import (get_freq_code as _gfc, to_offset, @@ -104,6 +103,10 @@ def __init__(self, value=None, freq=None, ordinal=None, dt = value if freq is None: raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') else: msg = "Value must be Period, string, integer, or datetime" raise ValueError(msg) diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 8ee78f4df652e..152f677a5b03e 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py 
@@ -43,41 +43,60 @@ def tsplot(series, plotf, **kwargs): """ # Used inferred freq is possible, need a test case for inferred - freq = getattr(series.index, 'freq', None) - if freq is None and hasattr(series.index, 'inferred_freq'): - freq = series.index.inferred_freq + if 'ax' in kwargs: + ax = kwargs.pop('ax') + else: + import matplotlib.pyplot as plt + ax = plt.gca() - if isinstance(freq, DateOffset): - freq = freq.rule_code + freq = _get_freq(ax, series) + # resample against axes freq if necessary + if freq is None: # pragma: no cover + raise ValueError('Cannot use dynamic axis without frequency info') else: - freq = frequencies.get_base_alias(freq) + ax_freq = getattr(ax, 'freq', None) + if (ax_freq is not None) and (freq != ax_freq): + if frequencies.is_subperiod(freq, ax_freq): # downsample + how = kwargs.pop('how', 'last') + series = series.resample(ax_freq, how=how) + elif frequencies.is_superperiod(freq, ax_freq): + series = series.resample(ax_freq) + else: # one freq is weekly + how = kwargs.pop('how', 'last') + series = series.resample('D', how=how, fill_method='pad') + series = series.resample(ax_freq, how=how, fill_method='pad') + freq = ax_freq - freq = frequencies.get_period_alias(freq) # Convert DatetimeIndex to PeriodIndex if isinstance(series.index, DatetimeIndex): series = series.to_period(freq=freq) - if freq != series.index.freq: - series = series.asfreq(freq) - style = kwargs.pop('style', None) - if 'ax' in kwargs: - ax = kwargs.pop('ax') - else: - import matplotlib.pyplot as plt - ax = plt.gca() - # Specialized ts plotting attributes for Axes ax.freq = freq xaxis = ax.get_xaxis() xaxis.freq = freq - xaxis.converter = DateConverter ax.legendlabels = [kwargs.get('label', None)] ax.view_interval = None ax.date_axis_info = None # format args and lot + args = _maybe_mask(series) + + if style is not None: + args.append(style) + + plotf(ax, *args, **kwargs) + + format_dateaxis(ax, ax.freq) + + left, right = _get_xlim(ax.get_lines()) + 
ax.set_xlim(left, right) + + return ax + +def _maybe_mask(series): mask = isnull(series) if mask.any(): masked_array = np.ma.array(series.values) @@ -85,19 +104,37 @@ def tsplot(series, plotf, **kwargs): args = [series.index, masked_array] else: args = [series.index, series] + return args - if style is not None: - args.append(style) +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, 'freq', None) + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) - plotf(ax, *args, **kwargs) + ax_freq = getattr(ax, 'freq', None) - format_dateaxis(ax, ax.freq) + # use axes freq if no data freq + if freq is None: + freq = ax_freq - left = series.index[0] #get_datevalue(series.index[0], freq) - right = series.index[-1] #get_datevalue(series.index[-1], freq) - ax.set_xlim(left, right) + # get the period frequency + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = frequencies.get_base_alias(freq) - return ax + freq = frequencies.get_period_alias(freq) + + return freq + +def _get_xlim(lines): + left, right = np.inf, -np.inf + for l in lines: + x = l.get_xdata() + left = min(x[0].ordinal, left) + right = max(x[-1].ordinal, right) + return left, right def get_datevalue(date, freq): if isinstance(date, Period): @@ -136,7 +173,6 @@ def _get_default_annual_spacing(nyears): (min_spacing, maj_spacing) = (factor * 20, factor * 100) return (min_spacing, maj_spacing) - def period_break(dates, period): """ Returns the indices where the given period changes. 
@@ -702,12 +738,24 @@ def format_dateaxis(subplot, freq): subplot.xaxis.set_minor_formatter(minformatter) pylab.draw_if_interactive() -class DateConverter(object): - @classmethod - def convert(cls, values, units, axis): - if isinstance(values, (int, float, str, datetime, Period)): +import matplotlib.units as units +import matplotlib.dates as dates + +class PeriodConverter(dates.DateConverter): + + @staticmethod + def convert(values, units, axis): + if not hasattr(axis, 'freq'): + raise TypeError('Axis must have `freq` set to convert to Periods') + valid_types = (str, datetime, Period, pydt.date, pydt.time) + if (isinstance(values, valid_types) or com.is_integer(values) or + com.is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - return map(lambda x: get_datevalue(x, axis.freq), values) + if isinstance(values, (list, tuple, np.ndarray)): + return [get_datevalue(x, axis.freq) for x in values] + return values + +units.registry[Period] = PeriodConverter() diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 05f638afaebed..1311b9ca451a2 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1389,6 +1389,13 @@ def test_dateoffset_misc(): assert(not offsets.DateOffset(months=2) == 2) +def test_freq_offsets(): + off = BDay(1, offset=timedelta(0, 1800)) + assert(off.freqstr == 'B+30Min') + + off = BDay(1, offset=timedelta(0, -1800)) + assert(off.freqstr == 'B-30Min') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c65615cd1d03e..fb09fec8bfd1d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -7,7 +7,7 @@ """ from unittest import TestCase -from datetime import datetime, timedelta +from datetime import 
datetime, date, timedelta import unittest from numpy.ma.testutils import assert_equal @@ -168,6 +168,14 @@ def test_period_constructor(self): i1 = Period(ordinal=200701, freq='M') self.assertEqual(i1.year, 18695) + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), freq='M') + self.assertEqual(i1, i2) + self.assertRaises(ValueError, Period, ordinal=200701) def test_freq_str(self): diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 84dadf03bf9eb..af399ee83e185 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -1,11 +1,12 @@ import os -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date, time import unittest import nose import numpy as np from numpy.testing.decorators import slow +from numpy.testing import assert_array_equal from pandas import Index, Series, DataFrame, isnull, notnull @@ -41,33 +42,62 @@ def setUp(self): columns=['A', 'B', 'C']) for x in idx] - freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A'] + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x, columns=['A', 'B', 'C']) for x in idx] + @slow + def test_frame_inferred(self): + # inferred freq + import matplotlib.pyplot as plt + plt.close('all') + idx = date_range('1/1/1987', freq='MS', periods=100) + idx = DatetimeIndex(idx.values, freq=None) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + df.plot() + + # axes freq + idx = idx[0:40] + idx[45:99] + df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) + df2.plot() + plt.close('all') + @slow def test_tsplot(self): from pandas.tseries.plotting import tsplot import 
matplotlib.pyplot as plt + plt.close('all') + ax = plt.gca() ts = tm.makeTimeSeries() plot_ax = tsplot(ts, plt.Axes.plot) self.assert_(plot_ax == ax) f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + plt.close('all') for s in self.period_ser: _check_plot_works(f, s.index.freq, ax=ax, series=s) + plt.close('all') for s in self.datetime_ser: _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) + plt.close('all') plt.close('all') ax = ts.plot(style='k') self.assert_((0., 0., 0.) == ax.get_lines()[0].get_color()) + @slow + def test_high_freq(self): + freaks = ['ms', 'us'] + for freq in freaks: + rng = date_range('1/1/2012', periods=100000, freq=freq) + ser = Series(np.random.randn(len(rng)), rng) + _check_plot_works(ser.plot) + def test_get_datevalue(self): from pandas.tseries.plotting import get_datevalue self.assert_(get_datevalue(None, 'D') is None) @@ -226,8 +256,10 @@ def test_finder_daily(self): @slow def test_finder_quarterly(self): + import matplotlib.pyplot as plt xp = Period('1988Q1').ordinal yrs = [3.5, 11] + plt.close('all') for n in yrs: rng = period_range('1987Q2', periods=int(n * 4), freq='Q') ser = Series(np.random.randn(len(rng)), rng) @@ -242,8 +274,10 @@ def test_finder_quarterly(self): @slow def test_finder_monthly(self): + import matplotlib.pyplot as plt xp = Period('1988-1').ordinal yrs = [1.15, 2.5, 4, 11] + plt.close('all') for n in yrs: rng = period_range('1987Q2', periods=int(n * 12), freq='M') ser = Series(np.random.randn(len(rng)), rng) @@ -255,8 +289,12 @@ def test_finder_monthly(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] self.assertEqual(xp, rs) + plt.close('all') - + @slow + def test_finder_monthly_long(self): + import matplotlib.pyplot as plt + plt.close('all') rng = period_range('1988Q1', periods=24*12, freq='M') ser = Series(np.random.randn(len(rng)), rng) ax = ser.plot() @@ -268,6 +306,7 @@ def test_finder_monthly(self): @slow def test_finder_annual(self): import matplotlib.pyplot 
as plt + plt.close('all') xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): rng = period_range('1987', periods=nyears, freq='A') @@ -424,6 +463,128 @@ def test_secondary_frame(self): self.assert_(axes[1].get_yaxis().get_ticks_position() == 'default') self.assert_(axes[2].get_yaxis().get_ticks_position() == 'right') + @slow + def test_mixed_freq_regular_first(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s1.plot() + ax2 = s2.plot(style='g') + lines = ax2.get_lines() + idx1 = lines[0].get_xdata() + idx2 = lines[1].get_xdata() + self.assert_(idx1.equals(s1.index.to_period('B'))) + self.assert_(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assert_(left == pidx[0].ordinal) + self.assert_(right == pidx[-1].ordinal) + plt.close('all') + + @slow + def test_mixed_freq_irregular_first(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s2.plot(style='g') + ax = s1.plot() + self.assert_(not hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, s1.index.asobject.values) + plt.close('all') + + @slow + def test_mixed_freq_hf_first(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'D') + + @slow + def test_mixed_freq_lf_first(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = 
date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + + @slow + def test_mixed_freq_irreg_period(self): + ts = tm.makeTimeSeries() + irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] + rng = period_range('1/3/2000', periods=30, freq='B') + ps = Series(np.random.randn(len(rng)), rng) + irreg.plot() + ps.plot() + + @slow + def test_to_weekly_resampling(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq.startswith('W')) + + @slow + def test_from_weekly_resampling(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + + @slow + def test_irreg_dtypes(self): + import matplotlib.pyplot as plt + #date + idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] + df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) + _check_plot_works(df.plot) + + #np.datetime64 + idx = date_range('1/1/2000', periods=10) + idx = idx[[0, 2, 5, 9]].asobject + df = DataFrame(np.random.randn(len(idx), 3), idx) + _check_plot_works(df.plot) + + #time + plt.close('all') + inc = Series(np.random.randint(1, 15, 3)).cumsum().values + idx = [time(1, 1, i) for i in inc] + df = DataFrame(np.random.randn(len(idx), 3), idx) + ax = df.plot() + ticks = ax.get_xticks() 
+ labels = ax.get_xticklabels() + td = dict(zip(ticks, labels)) + for i in range(3): + self.assert_(td[i].get_text() == str(idx[i])) + PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt @@ -431,19 +592,22 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): fig = plt.gcf() plt.clf() ax = fig.add_subplot(211) + orig_ax = kwargs.pop('ax', plt.gca()) + orig_axfreq = getattr(orig_ax, 'freq', None) + ret = f(*args, **kwargs) assert(ret is not None) # do something more intelligent - orig_ax = kwargs.pop('ax', plt.gca()) - if series is not None: # non-business + ax = kwargs.pop('ax', plt.gca()) + if series is not None: dfreq = series.index.freq if isinstance(dfreq, DateOffset): dfreq = dfreq.rule_code - #dfreq = frequencies.offset_to_period_alias(dfreq) - assert(orig_ax.freq == dfreq) + if orig_axfreq is None: + assert(ax.freq == dfreq) - if freq is not None: - assert(orig_ax.freq == freq) + if freq is not None and orig_axfreq is None: + assert(ax.freq == freq) ax = fig.add_subplot(212) try: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 06fc71c2cc65f..5f6a00a27699f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -384,6 +384,12 @@ def test_frame_add_datetime64_column(self): df['A'] = rng self.assert_(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! 
+ repr(df) + def test_frame_add_datetime64_col_other_units(self): n = 100 @@ -898,6 +904,24 @@ def test_timestamp_fields(self): self.assertEqual(idx.freq, Timestamp(idx[0], idx.freq).freq) self.assertEqual(idx.freqstr, Timestamp(idx[0], idx.freq).freqstr) + def test_timestamp_date_out_of_range(self): + self.assertRaises(ValueError, Timestamp, '1676-01-01') + self.assertRaises(ValueError, Timestamp, '2263-01-01') + + # 1475 + self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01']) + self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + + def test_timestamp_repr(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + self.assert_(iso8601 in result) + def test_datetimeindex_integers_shift(self): rng = date_range('1/1/2000', periods=20) @@ -918,7 +942,6 @@ def test_astype_object(self): self.assert_(np.array_equal(casted, exp_values)) - def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) # blow up, don't loop forever diff --git a/setup.py b/setup.py index ba80a464060dc..2c62f7051bfce 100755 --- a/setup.py +++ b/setup.py @@ -174,9 +174,9 @@ MAJOR = 0 MINOR = 8 MICRO = 0 -ISRELEASED = False +ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) -QUALIFIER = 'b2' +QUALIFIER = 'rc2' FULLVERSION = VERSION if not ISRELEASED: