From 91afd548fa1e33ea1fc73b49f9766921547a2e2b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 12:45:44 -0400 Subject: [PATCH 01/42] BUG: minimal out of bounds checking on future/past timestamps, gotcha docs, pre-1900 repr --- doc/source/gotchas.rst | 24 +++++++++++++++ pandas/src/datetime.pyx | 40 ++++++++++++++++++++++--- pandas/tseries/tests/test_timeseries.py | 15 +++++++++- 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 57d6471a8fe7b..f4c0eae4cfca0 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -217,3 +217,27 @@ passed in the index, thus finding the integers ``0`` and ``1``. While it would be possible to insert some logic to check whether a passed sequence is all contained in the index, that logic would exact a very high cost in large data sets. + +Timestamp limitations +--------------------- + +Minimum and maximum timestamps +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since pandas represents timestamps in nanosecond resolution, the timespan that +can be represented using a 64-bit integer is limited to approximately 584 years: + +.. ipython:: python + + begin = Timestamp(-9223285636854775809L) + begin + end = Timestamp(np.iinfo(np.int64).max) + end + +If you need to represent time series data outside the nanosecond timespan, use +PeriodIndex: + +.. 
ipython:: python + + span = period_range('1215-01-01', '1381-01-01', freq='D') + span diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx index ec50275ccc4a6..38adf46285e0d 100644 --- a/pandas/src/datetime.pyx +++ b/pandas/src/datetime.pyx @@ -65,6 +65,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): return result + # Python front end to C extension type _Timestamp # This serves as the box for datetime64 class Timestamp(_Timestamp): @@ -101,10 +102,27 @@ class Timestamp(_Timestamp): return ts_base def __repr__(self): - result = self.strftime('' + result = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (self.year, self.month, + self.day, self.hour, + self.minute, self.second) + + if self.nanosecond != 0: + nanos = self.nanosecond + 1000 * self.microsecond + result += '.%.9d' % nanos + elif self.microsecond != 0: + result += '.%.6d' % self.microsecond + + try: + result += self.strftime('%z') + if self.tzinfo: + result += self.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + except ValueError: + year2000 = self.replace(year=2000) + result += year2000.strftime('%z') + if self.tzinfo: + result += year2000.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + + return '' % result @property def tz(self): @@ -507,6 +525,7 @@ cpdef convert_to_tsobject(object ts, object tz=None): obj.tzinfo = ts.tzinfo if obj.tzinfo is not None: obj.value -= _delta_to_nanoseconds(obj.tzinfo._utcoffset) + _check_dts_bounds(obj.value, &obj.dts) return obj elif PyDate_Check(ts): obj.value = _date_to_datetime64(ts, &obj.dts) @@ -514,6 +533,9 @@ cpdef convert_to_tsobject(object ts, object tz=None): raise ValueError("Could not construct Timestamp from argument %s" % type(ts)) + if obj.value != NPY_NAT: + _check_dts_bounds(obj.value, &obj.dts) + if tz is not None: if tz is pytz.utc: obj.tzinfo = tz @@ -530,6 +552,16 @@ cpdef convert_to_tsobject(object ts, object tz=None): return obj +cdef int64_t _NS_LOWER_BOUND = -9223285636854775809LL +cdef int64_t _NS_UPPER_BOUND = -9223372036854775807LL + +cdef 
inline _check_dts_bounds(int64_t value, pandas_datetimestruct *dts): + cdef pandas_datetimestruct dts2 + if dts.year <= 1677 or dts.year >= 2262: + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts2) + if dts2.year != dts.year: + raise ValueError('Out of bounds timestamp in year: %s' % dts.year) + # elif isinstance(ts, _Timestamp): # tmp = ts # obj.value = (<_Timestamp> ts).value diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 06fc71c2cc65f..9c6542d4c2edc 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -898,6 +898,20 @@ def test_timestamp_fields(self): self.assertEqual(idx.freq, Timestamp(idx[0], idx.freq).freq) self.assertEqual(idx.freqstr, Timestamp(idx[0], idx.freq).freqstr) + def test_timestamp_date_out_of_range(self): + self.assertRaises(ValueError, Timestamp, '1676-01-01') + self.assertRaises(ValueError, Timestamp, '2263-01-01') + + def test_timestamp_repr(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + self.assert_(iso8601 in result) + def test_datetimeindex_integers_shift(self): rng = date_range('1/1/2000', periods=20) @@ -918,7 +932,6 @@ def test_astype_object(self): self.assert_(np.array_equal(casted, exp_values)) - def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) # blow up, don't loop forever From 5fbffe4d1cd8b422119eb2b86c8a3f58e940f18c Mon Sep 17 00:00:00 2001 From: Chang She Date: Thu, 21 Jun 2012 12:36:35 -0400 Subject: [PATCH 02/42] BUG: sub-second plotting was not working --- pandas/tools/plotting.py | 19 ++++++++++++------- pandas/tseries/frequencies.py | 24 ++++++++++++++++++++++-- pandas/tseries/plotting.py | 1 - pandas/tseries/tests/test_plotting.py | 9 +++++++++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/pandas/tools/plotting.py 
b/pandas/tools/plotting.py index adf5abdebe0df..fa351207f0059 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -10,7 +10,7 @@ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex -from pandas.tseries.frequencies import get_period_alias +from pandas.tseries.frequencies import get_period_alias, get_base_alias from pandas.tseries.offsets import DateOffset import pandas.tseries.tools as datetools @@ -590,14 +590,19 @@ def __init__(self, data, **kwargs): def has_ts_index(self): from pandas.core.frame import DataFrame if isinstance(self.data, (Series, DataFrame)): - if isinstance(self.data.index, (DatetimeIndex, PeriodIndex)): - has_freq = (hasattr(self.data.index, 'freq') and - self.data.index.freq is not None) - has_inferred = (hasattr(self.data.index, 'inferred_freq') and - self.data.index.inferred_freq is not None) - return has_freq or has_inferred + freq = (getattr(self.data.index, 'freq', None) + or getattr(self.data.index, 'inferred_freq', None)) + return (freq is not None) and self._has_dynamic_index_freq(freq) return False + def _has_dynamic_index_freq(self, freq): + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + freq = get_period_alias(freq) + return freq is not None + def _make_plot(self): # this is slightly deceptive if self.use_index and self.has_ts_index: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8b87f8ccbf820..433e88e2d7ecb 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -246,7 +246,16 @@ def _get_freq_str(base, mult=1): 'BA' : 'A', 'AS' : 'A', 'BAS' : 'A', - 'MS' : 'M' + 'MS' : 'M', + 'D' : 'D', + 'B' : 'B', + 'T' : 'T', + 'S' : 'S', + 'H' : 'H', + 'Q' : 'Q', + 'A' : 'A', + 'W' : 'W', + 'M' : 'M' } need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] @@ -257,9 +266,20 @@ def _get_freq_str(base, mult=1): _offset_to_period_map['%s-%s' % 
(prefix, m)] = \ _offset_to_period_map[prefix] +months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] +for prefix in ['A', 'Q']: + for m in months: + alias = '%s-%s' % (prefix, m) + _offset_to_period_map[alias] = alias + +_days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] +for d in _days: + _offset_to_period_map['W-%s' % d] = 'W-%s' % d + def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" - return _offset_to_period_map.get(offset_str, offset_str) + return _offset_to_period_map.get(offset_str, None) _rule_aliases = { # Legacy rules that will continue to map to their original values diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 8ee78f4df652e..79823e8c12dca 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -136,7 +136,6 @@ def _get_default_annual_spacing(nyears): (min_spacing, maj_spacing) = (factor * 20, factor * 100) return (min_spacing, maj_spacing) - def period_break(dates, period): """ Returns the indices where the given period changes. diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 84dadf03bf9eb..81f972debe6c2 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -68,6 +68,14 @@ def test_tsplot(self): ax = ts.plot(style='k') self.assert_((0., 0., 0.) 
== ax.get_lines()[0].get_color()) + @slow + def test_high_freq(self): + freaks = ['ms', 'us'] + for freq in freaks: + rng = date_range('1/1/2012', periods=100000, freq=freq) + ser = Series(np.random.randn(len(rng)), rng) + _check_plot_works(ser.plot) + def test_get_datevalue(self): from pandas.tseries.plotting import get_datevalue self.assert_(get_datevalue(None, 'D') is None) @@ -268,6 +276,7 @@ def test_finder_monthly(self): @slow def test_finder_annual(self): import matplotlib.pyplot as plt + plt.close('all') xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): rng = period_range('1987', periods=nyears, freq='A') From 32d73012285d1228665b05f01059bd82aea04ceb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 13:02:51 -0400 Subject: [PATCH 03/42] BUG: nanosecond out-of-bounds checking in array conversions. close #1475 --- pandas/src/datetime.pyx | 11 ++++++++++- pandas/tseries/tests/test_timeseries.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx index 38adf46285e0d..17f35a73501aa 100644 --- a/pandas/src/datetime.pyx +++ b/pandas/src/datetime.pyx @@ -560,7 +560,11 @@ cdef inline _check_dts_bounds(int64_t value, pandas_datetimestruct *dts): if dts.year <= 1677 or dts.year >= 2262: pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts2) if dts2.year != dts.year: - raise ValueError('Out of bounds timestamp in year: %s' % dts.year) + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + + raise ValueError('Out of bounds nanosecond timestamp: %s' % fmt) # elif isinstance(ts, _Timestamp): # tmp = ts @@ -645,8 +649,10 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False): iresult[i] = iNaT elif PyDateTime_Check(val): iresult[i] = _pydatetime_to_dts(val, &dts) + _check_dts_bounds(iresult[i], &dts) elif PyDate_Check(val): iresult[i] = 
_date_to_datetime64(val, &dts) + _check_dts_bounds(iresult[i], &dts) elif util.is_datetime64_object(val): iresult[i] = _get_datetime64_nanos(val) elif util.is_integer_object(val): @@ -659,6 +665,9 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False): result[i] = parse(val, dayfirst=dayfirst) except Exception: raise TypeError + pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns, + &dts) + _check_dts_bounds(iresult[i], &dts) return result except TypeError: oresult = np.empty(n, dtype=object) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 9c6542d4c2edc..f04f0a5b8259c 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -902,6 +902,10 @@ def test_timestamp_date_out_of_range(self): self.assertRaises(ValueError, Timestamp, '1676-01-01') self.assertRaises(ValueError, Timestamp, '2263-01-01') + # 1475 + self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01']) + self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + def test_timestamp_repr(self): # pre-1900 stamp = Timestamp('1850-01-01', tz='US/Eastern') From 1372d0f09f8b5b625b20587d3608298a1fd44b98 Mon Sep 17 00:00:00 2001 From: Wouter Overmeire Date: Thu, 21 Jun 2012 20:36:37 +0200 Subject: [PATCH 04/42] ENH: Add sharex and sharey to DataFrame.hist() --- pandas/tests/test_graphics.py | 3 +++ pandas/tools/plotting.py | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 65fc503f2eb7f..57bc17c5c33b6 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -223,6 +223,9 @@ def test_hist(self): df = DataFrame(np.random.randn(100, 6)) _check_plot_works(df.hist) + #make sure sharex, sharey is handled + _check_plot_works(df.hist, sharex=True, sharey=True) + #make sure kwargs are handled ser = df[0] xf, yf = 20, 20 diff --git a/pandas/tools/plotting.py 
b/pandas/tools/plotting.py index adf5abdebe0df..a669b15596fe6 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1095,7 +1095,8 @@ def plot_group(group, ax): def hist_frame(data, grid=True, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None, ax=None, **kwds): + ylabelsize=None, yrot=None, ax=None, + sharex=False, sharey=False, **kwds): """ Draw Histogram the DataFrame's series using matplotlib / pylab. @@ -1112,6 +1113,8 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, yrot : float, default None rotation of y axis labels ax : matplotlib axes object, default None + sharex : bool, if True, the X axis will be shared amongst all subplots. + sharey : bool, if True, the Y axis will be shared amongst all subplots. kwds : other plotting keyword arguments To be passed to hist function """ @@ -1123,7 +1126,8 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, rows += 1 else: cols += 1 - _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False) + _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey) for i, col in enumerate(com._try_sort(data.columns)): ax = axes[i / cols][i % cols] From 3815815639753b82afbd192cd1f01ab3f7a735bd Mon Sep 17 00:00:00 2001 From: Wouter Overmeire Date: Thu, 21 Jun 2012 20:38:36 +0200 Subject: [PATCH 05/42] DOC: Fix typo. --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index a669b15596fe6..7ca432e89ecd5 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1316,7 +1316,7 @@ def _subplots(nrows=1, ncols=1, sharex=False, sharey=False, squeeze=True, sharex : bool If True, the X axis will be shared amongst all subplots. - sharex : bool + sharey : bool If True, the Y axis will be shared amongst all subplots. 
squeeze : bool From 214c24b44dba241a4fce0fc988cedfde135bde05 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 15:52:40 -0400 Subject: [PATCH 06/42] DOC: finish updating release notes --- RELEASE.rst | 15 ++++++++++++++- doc/source/whatsnew/v0.8.0.txt | 6 ++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 7fe8b017cf71b..d187b211e3f2f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -86,7 +86,9 @@ pandas 0.8.0 - Add lag plot (#1440) - Add autocorrelation_plot (#1425) - Add support for tox and Travis CI (#1382) - - Add support for ordered factors and use in GroupBy (#292) + - Add support for Categorical use in GroupBy (#292) + - Add ``any`` and ``all`` methods to DataFrame (#1416) + - Add ``secondary_y`` option to Series.plot **Improvements to existing features** @@ -124,9 +126,16 @@ pandas 0.8.0 - Add ``convert_dtype`` option to Series.apply to be able to leave data as dtype=object (#1414) - Can specify all index level names in concat (#1419) + - Add ``dialect`` keyword to parsers for quoting conventions (#1363) + - Enable DataFrame[bool_DataFrame] += value (#1366) + - Add ``retries`` argument to ``get_data_yahoo`` to try to prevent Yahoo! API + 404s (#826) + - Improve performance of reshaping by using O(N) categorical sorting **API Changes** + - Rename Factor to Categorical and add improvements. Numerous Categorical bug + fixes - Frequency name overhaul, WEEKDAY/EOM and rules with @ deprecated. 
get_legacy_offset_name backwards compatibility function added - Raise ValueError in DataFrame.__nonzero__, so "if df" no longer works @@ -190,6 +199,10 @@ pandas 0.8.0 - Fix outer/inner DataFrame.join with non-unique indexes (#1421) - Fix MultiIndex groupby bugs with empty lower levels (#1401) - Calling fillna with a Series will have same behavior as with dict (#1486) + - SparseSeries reduction bug (#1375) + - Fix unicode serialization issue in HDFStore (#1361) + - Pass keywords to pyplot.boxplot in DataFrame.boxplot (#1493) + - Bug fixes in MonthBegin (#1483) pandas 0.7.3 ============ diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index 5ea0be0ddd971..cbf094752d4a9 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -111,8 +111,10 @@ index duplication in many-to-many joins) Other new features ~~~~~~~~~~~~~~~~~~ -- New :ref:`cut ` function (like R's cut function) for - computing a categorical variable from a continuous variable by binning values +- New :ref:`cut ` and ``qcut`` functions (like R's cut + function) for computing a categorical variable from a continuous variable by + binning values either into value-based (``cut``) or quantile-based (``qcut``) + bins - Add :ref:`limit ` argument to fillna/reindex - More flexible multiple function application in GroupBy, and can pass list (name, function) tuples to get result in particular order with given names From c0151bdcd91282080277974939f18f6efd222d43 Mon Sep 17 00:00:00 2001 From: Chang She Date: Thu, 21 Jun 2012 16:45:39 -0400 Subject: [PATCH 07/42] DOC: example for plotting on secondary y-axis --- doc/source/visualization.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 34bd803516468..dcf0f3b87343f 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -91,6 +91,20 @@ You may pass ``logy`` to get a log-scale Y axis. 
@savefig series_plot_logy.png width=4.5in ts.plot(logy=True) +Plotting on a Secondary Y-axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To plot data on a secondary y-axis, use the ``secondary_y`` keyword: + +.. ipython:: python + + plt.figure() + + df.A.plot() + + @savefig series_plot_secondary_y.png width=4.5in + df.B.plot(secondary_y=True, style='g') + Targeting different subplots ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -107,6 +121,7 @@ You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: @savefig series_plot_multi.png width=4.5in df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') + Other plotting features ----------------------- From 39687071b8627cf9fb86be39bf6bdcf5bd0c9a90 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 18:26:07 -0400 Subject: [PATCH 08/42] RLS: 0.8.0 what's new docs --- RELEASE.rst | 2 +- doc/source/{whatsnew => }/v0.4.x.txt | 0 doc/source/{whatsnew => }/v0.5.0.txt | 0 doc/source/{whatsnew => }/v0.6.0.txt | 0 doc/source/{whatsnew => }/v0.6.1.txt | 0 doc/source/{whatsnew => }/v0.7.0.txt | 0 doc/source/{whatsnew => }/v0.7.1.txt | 0 doc/source/{whatsnew => }/v0.7.2.txt | 0 doc/source/{whatsnew => }/v0.7.3.txt | 0 doc/source/{whatsnew => }/v0.8.0.txt | 60 +++++++++++++++++++++++----- doc/source/visualization.rst | 1 + doc/source/whatsnew.rst | 18 ++++----- pandas/tseries/frequencies.py | 41 +++++++++---------- 13 files changed, 79 insertions(+), 43 deletions(-) rename doc/source/{whatsnew => }/v0.4.x.txt (100%) rename doc/source/{whatsnew => }/v0.5.0.txt (100%) rename doc/source/{whatsnew => }/v0.6.0.txt (100%) rename doc/source/{whatsnew => }/v0.6.1.txt (100%) rename doc/source/{whatsnew => }/v0.7.0.txt (100%) rename doc/source/{whatsnew => }/v0.7.1.txt (100%) rename doc/source/{whatsnew => }/v0.7.2.txt (100%) rename doc/source/{whatsnew => }/v0.7.3.txt (100%) rename doc/source/{whatsnew => }/v0.8.0.txt (86%) diff --git a/RELEASE.rst b/RELEASE.rst index d187b211e3f2f..120f7fca03ce6 100644 --- a/RELEASE.rst +++ 
b/RELEASE.rst @@ -43,7 +43,7 @@ pandas 0.8.0 conversion method (#1018) - Implement robust frequency inference function and `inferred_freq` attribute on DatetimeIndex (#391) - - New ``tz_convert`` methods in Series / DataFrame + - New ``tz_convert`` and ``tz_localize`` methods in Series / DataFrame - Convert DatetimeIndexes to UTC if time zones are different in join/setops (#864) - Add limit argument for forward/backward filling to reindex, fillna, diff --git a/doc/source/whatsnew/v0.4.x.txt b/doc/source/v0.4.x.txt similarity index 100% rename from doc/source/whatsnew/v0.4.x.txt rename to doc/source/v0.4.x.txt diff --git a/doc/source/whatsnew/v0.5.0.txt b/doc/source/v0.5.0.txt similarity index 100% rename from doc/source/whatsnew/v0.5.0.txt rename to doc/source/v0.5.0.txt diff --git a/doc/source/whatsnew/v0.6.0.txt b/doc/source/v0.6.0.txt similarity index 100% rename from doc/source/whatsnew/v0.6.0.txt rename to doc/source/v0.6.0.txt diff --git a/doc/source/whatsnew/v0.6.1.txt b/doc/source/v0.6.1.txt similarity index 100% rename from doc/source/whatsnew/v0.6.1.txt rename to doc/source/v0.6.1.txt diff --git a/doc/source/whatsnew/v0.7.0.txt b/doc/source/v0.7.0.txt similarity index 100% rename from doc/source/whatsnew/v0.7.0.txt rename to doc/source/v0.7.0.txt diff --git a/doc/source/whatsnew/v0.7.1.txt b/doc/source/v0.7.1.txt similarity index 100% rename from doc/source/whatsnew/v0.7.1.txt rename to doc/source/v0.7.1.txt diff --git a/doc/source/whatsnew/v0.7.2.txt b/doc/source/v0.7.2.txt similarity index 100% rename from doc/source/whatsnew/v0.7.2.txt rename to doc/source/v0.7.2.txt diff --git a/doc/source/whatsnew/v0.7.3.txt b/doc/source/v0.7.3.txt similarity index 100% rename from doc/source/whatsnew/v0.7.3.txt rename to doc/source/v0.7.3.txt diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/v0.8.0.txt similarity index 86% rename from doc/source/whatsnew/v0.8.0.txt rename to doc/source/v0.8.0.txt index cbf094752d4a9..c4049c208029e 100644 --- 
a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/v0.8.0.txt @@ -67,15 +67,16 @@ Time series changes and improvements PeriodIndex and DatetimeIndex - New Timestamp data type subclasses `datetime.datetime`, providing the same interface while enabling working with nanosecond-resolution data. Also - provides **easy time zone conversions** -- Enhanced support for **time zones**. Add `tz_convert` methods to TimeSeries - and DataFrame. All timestamps are stored as UTC; Timestamps from - DatetimeIndex objects with time zone set will be localized to localtime. Time - zone conversions are therefore essentially free. User needs to know very - little about pytz library now; only time zone names as as strings are - required. Timestamps are equal if and only if their UTC timestamps - match. Operations between time series with different time zones will result - in a UTC-indexed time series + provides :ref:`easy time zone conversions `. +- Enhanced support for :ref:`time zones `. Add + `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All + timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time + zone set will be localized to localtime. Time zone conversions are therefore + essentially free. User needs to know very little about pytz library now; only + time zone names as strings are required. Time zone-aware timestamps are + equal if and only if their UTC timestamps match. Operations between time + zone-aware time series with different time zones will result in a UTC-indexed + time series. 
- Time series **string indexing conveniences** / shortcuts: slice years, year and month, and index values with strings - Enhanced time series **plotting**; adaptation of scikits.timeseries @@ -115,6 +116,7 @@ Other new features function) for computing a categorical variable from a continuous variable by binning values either into value-based (``cut``) or quantile-based (``qcut``) bins +- Rename ``Factor`` to ``Categorical`` and add a number of usability features - Add :ref:`limit ` argument to fillna/reindex - More flexible multiple function application in GroupBy, and can pass list (name, function) tuples to get result in particular order with given names @@ -135,8 +137,8 @@ Other new features memory usage than Python's dict - Add first, last, min, max, and prod optimized GroupBy functions - New :ref:`ordered_merge ` function -- Add flexible :ref:`comparison ` instance methods eq, ne, lt, gt, etc. to DataFrame, - Series +- Add flexible :ref:`comparison ` instance methods eq, ne, lt, + gt, etc. to DataFrame, Series - Improve :ref:`scatter_matrix ` plotting function and add histogram or kernel density estimates to diagonal - Add :ref:`'kde' ` plot option for density plots @@ -148,6 +150,42 @@ Other new features - Can select multiple columns from GroupBy - Add :ref:`update ` methods to Series/DataFrame for updating values in place +- Add ``any`` and ``all`` methods to DataFrame + +New plotting methods +~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + import pandas as pd + fx = pd.load('data/fx_prices') + import matplotlib.pyplot as plt + +``Series.plot`` now supports a ``secondary_y`` option: + +.. ipython:: python + + plt.figure() + + fx['FR'].plot(style='g') + + @savefig whatsnew_secondary_y.png width=4.5in + fx['IT'].plot(style='k--', secondary_y=True) + +Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot +types. For example, ``'kde'`` is a new option: + +.. 
ipython:: python + + s = Series(np.concatenate((np.random.randn(1000), + np.random.randn(1000) * 0.5 + 3))) + plt.figure() + s.hist(normed=True, alpha=0.2) + @savefig whatsnew_kde.png width=4.5in + s.plot(kind='kde') + +See :ref:`the plotting page ` for much more. Other API changes ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index dcf0f3b87343f..eb15d7f77252d 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -121,6 +121,7 @@ You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: @savefig series_plot_multi.png width=4.5in df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') +.. _visualization.other: Other plotting features ----------------------- diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index b930bdbbde1b1..e60baa37820c9 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,21 +16,21 @@ What's New These are new features and improvements of note in each release. -.. include:: whatsnew/v0.8.0.txt +.. include:: v0.8.0.txt -.. include:: whatsnew/v0.7.3.txt +.. include:: v0.7.3.txt -.. include:: whatsnew/v0.7.2.txt +.. include:: v0.7.2.txt -.. include:: whatsnew/v0.7.1.txt +.. include:: v0.7.1.txt -.. include:: whatsnew/v0.7.0.txt +.. include:: v0.7.0.txt -.. include:: whatsnew/v0.6.1.txt +.. include:: v0.6.1.txt -.. include:: whatsnew/v0.6.0.txt +.. include:: v0.6.0.txt -.. include:: whatsnew/v0.5.0.txt +.. include:: v0.5.0.txt -.. include:: whatsnew/v0.4.x.txt +.. 
include:: v0.4.x.txt diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 433e88e2d7ecb..98f9fbe3c5b6c 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -259,23 +259,20 @@ def _get_freq_str(base, mult=1): } need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] -months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', - 'OCT', 'NOV', 'DEC'] -for prefix in need_suffix: - for m in months: - _offset_to_period_map['%s-%s' % (prefix, m)] = \ - _offset_to_period_map[prefix] - -months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', - 'OCT', 'NOV', 'DEC'] -for prefix in ['A', 'Q']: - for m in months: - alias = '%s-%s' % (prefix, m) - _offset_to_period_map[alias] = alias +_months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] +for __prefix in need_suffix: + for _m in _months: + _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ + _offset_to_period_map[__prefix] +for __prefix in ['A', 'Q']: + for _m in _months: + _alias = '%s-%s' % (__prefix, _m) + _offset_to_period_map[_alias] = _alias _days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] -for d in _days: - _offset_to_period_map['W-%s' % d] = 'W-%s' % d +for _d in _days: + _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" @@ -322,11 +319,11 @@ def get_period_alias(offset_str): 'us': 'U' } -for i, weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): - for iweek in xrange(4): - name = 'WOM-%d%s' % (iweek + 1, weekday) - _offset_map[name] = offsets.WeekOfMonth(week=iweek, weekday=i) - _rule_aliases[name.replace('-', '@')] = name +for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): + for _iweek in xrange(4): + _name = 'WOM-%d%s' % (_iweek + 1, _weekday) + _offset_map[_name] = offsets.WeekOfMonth(week=_iweek, weekday=_i) + _rule_aliases[_name.replace('-', '@')] = _name _legacy_reverse_map = 
dict((v, k) for k, v in _rule_aliases.iteritems()) @@ -562,8 +559,8 @@ def get_standard_freq(freq): } _reverse_period_code_map = {} -for k, v in _period_code_map.iteritems(): - _reverse_period_code_map[v] = k +for _k, _v in _period_code_map.iteritems(): + _reverse_period_code_map[_v] = _k # Additional aliases _period_code_map.update({ From a6f553424b1aa8c433d6727d1622594ffe68ed26 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 18:57:18 -0400 Subject: [PATCH 09/42] BUG: DataFrame constructor regression with dict of tuples close #1491 --- pandas/core/frame.py | 2 +- pandas/tests/test_frame.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b071bca8cf5de..139f0bfb94c20 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4604,7 +4604,7 @@ def extract_index(data): elif isinstance(v, dict): have_dicts = True indexes.append(v.keys()) - elif isinstance(v, (list, np.ndarray)): + elif isinstance(v, (list, tuple, np.ndarray)): have_raw_arrays = True raw_lengths.append(len(v)) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6c3cc89f18f1e..58b7a8c847735 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1579,6 +1579,14 @@ def test_constructor_dict_dont_upcast(self): dm = DataFrame([[1,2],['a','b']], index=[1,2], columns=[1,2]) self.assert_(isinstance(dm[1][1], int)) + def test_constructor_dict_of_tuples(self): + # GH #1491 + data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + + result = DataFrame(data) + expected = DataFrame(dict((k, list(v)) for k, v in data.iteritems())) + assert_frame_equal(result, expected) + def test_constructor_ndarray(self): mat = np.zeros((2, 3), dtype=float) From 2c511445b27ca61dfb2a4e9ca36901c36cb6a891 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 19:02:52 -0400 Subject: [PATCH 10/42] BUG: don't accidentally upcast values in DataFrame.itertuples, close #1505 --- pandas/core/frame.py 
| 8 ++++++-- pandas/tests/test_frame.py | 6 ++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 139f0bfb94c20..e84d51bbe92ab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -615,12 +615,16 @@ def iterrows(self): s.name = k yield k, s - def itertuples(self): + def itertuples(self, index=True): """ Iterate over rows of DataFrame as tuples, with index value as first element of the tuple """ - return izip(self.index, *self.values.T) + arrays = [] + if index: + arrays.append(self.index) + arrays.extend(self[k] for k in self.columns) + return izip(*arrays) iterkv = iteritems if py3compat.PY3: # pragma: no cover diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 58b7a8c847735..1906c3936292d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2544,6 +2544,12 @@ def test_itertuples(self): expected = self.frame.ix[i,:].reset_index(drop=True) assert_series_equal(s, expected) + df = DataFrame({'floats': np.random.randn(5), + 'ints': range(5)}, columns=['floats', 'ints']) + + for tup in df.itertuples(index=False): + self.assert_(isinstance(tup[1], np.integer)) + def test_len(self): self.assertEqual(len(self.frame), len(self.frame.index)) From 87d6da19b68c459573a15c911f1a827705767fe5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 21 Jun 2012 19:11:05 -0400 Subject: [PATCH 11/42] BUG: revert to circle marker for markers not in matplotlib < 1.1. 
close #1484 --- pandas/tools/plotting.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 9c9165e61ff9a..6aae74571182f 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -50,6 +50,8 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, mask = com.notnull(df) + marker = _get_marker_compat(marker) + for i, a in zip(range(n), df.columns): for j, b in zip(range(n), df.columns): if i == j: @@ -130,6 +132,12 @@ def _gcf(): import matplotlib.pyplot as plt return plt.gcf() +def _get_marker_compat(marker): + import matplotlib.lines as mlines + if marker not in mlines.lineMarkers: + return 'o' + return marker + def andrews_curves(data, class_column, ax=None, samples=200): """ Parameters: From 9d2d4bfb9bcb7a3003ab423aa17e922e6b0cf29d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 22 Jun 2012 10:29:09 -0400 Subject: [PATCH 12/42] BUG: clean NAs in DataFrame.boxplot, close #1506 --- pandas/tools/plotting.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 6aae74571182f..485d568f91353 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -7,7 +7,7 @@ from pandas.util.decorators import cache_readonly import pandas.core.common as com from pandas.core.index import Index, MultiIndex -from pandas.core.series import Series +from pandas.core.series import Series, remove_na from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex from pandas.tseries.frequencies import get_period_alias, get_base_alias @@ -1007,7 +1007,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, def plot_group(grouped, ax): keys, values = zip(*grouped) keys = [_stringify(x) for x in keys] - ax.boxplot(values, **kwds) + ax.boxplot(remove_na(values), **kwds) if kwds.get('vert', 1): ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) else: @@ 
-1043,7 +1043,8 @@ def plot_group(grouped, ax): # Return boxplot dict in single plot case - bp = ax.boxplot(list(data[cols].values.T), **kwds) + clean_values = [remove_na(x) for x in data[cols].values.T] + bp = ax.boxplot(clean_values, **kwds) if kwds.get('vert', 1): ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) else: From 44a8295c8f708b40a9b1279cf5d741edacf9d58e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 22 Jun 2012 15:16:00 -0400 Subject: [PATCH 13/42] BUG: fix NumPy 1.7 cast-to-bool failure and buglet --- doc/source/timeseries.rst | 2 +- pandas/core/frame.py | 6 +++--- pandas/core/index.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index c616a07670e02..ac62bae5071cf 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -297,7 +297,7 @@ We could have done the same thing with ``DateOffset``: .. ipython:: python - from pandas.core.datetools import * + from pandas.tseries.offsets import * d + DateOffset(months=4, days=5) The key features of a ``DateOffset`` object are: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e84d51bbe92ab..4d41c02ad4200 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4226,9 +4226,9 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, if result.dtype == np.object_: try: if filter_type is None or filter_type == 'numeric': - result = result.astype('f8') - elif filter_type == 'bool': - result = result.astype('b') + result = result.astype(np.float64) + elif filter_type == 'bool' and notnull(result).all(): + result = result.astype(np.bool_) else: raise ValueError('Invalid dtype %s ' % str(filter_type)) diff --git a/pandas/core/index.py b/pandas/core/index.py index db0bd1bd7147e..14edc86cc600c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -405,7 +405,7 @@ def asof(self, label): def asof_locs(self, where, mask): """ where : array of timestamps - mask : array of 
booleans where data is NA + mask : array of booleans where data is not NA """ locs = self.values[mask].searchsorted(where.values, side='right') From 836ecab757b3b952e6599c8dcc6b6c761b27db7a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 22 Jun 2012 15:26:56 -0400 Subject: [PATCH 14/42] BUG: preserve index names in MultiIndex.drop, close #1513 --- pandas/core/index.py | 3 ++- pandas/tests/test_multilevel.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 14edc86cc600c..35096651b90ef 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2295,7 +2295,8 @@ def delete(self, loc): new_index : MultiIndex """ new_labels = [np.delete(lab, loc) for lab in self.labels] - return MultiIndex(levels=self.levels, labels=new_labels) + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names) get_major_bounds = slice_locs diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e6698ff52afaf..1189df2ac3ce6 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1355,6 +1355,16 @@ def test_drop_level(self): expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T assert_frame_equal(result, expected) + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]], + names=['one', 'two']) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + self.assert_(result.index.names == ['one', 'two']) + def test_unicode_repr_issues(self): levels = [Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3']), Index([0, 1])] From 5f5df2aa34f027d3fc83abb19cf7840ababd4557 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 22 Jun 2012 15:34:42 -0400 Subject: [PATCH 15/42] RLS: 0.8.0 release candidate 1 --- RELEASE.rst | 3 ++- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 120f7fca03ce6..bfb2ca515372a 
100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -25,7 +25,7 @@ Where to get it pandas 0.8.0 ============ -**Release date:** NOT YET RELEASED +**Release date:** 6/22/2012 **New features** @@ -203,6 +203,7 @@ pandas 0.8.0 - Fix unicode serialization issue in HDFStore (#1361) - Pass keywords to pyplot.boxplot in DataFrame.boxplot (#1493) - Bug fixes in MonthBegin (#1483) + - Preserve MultiIndex names in drop (#1513) pandas 0.7.3 ============ diff --git a/setup.py b/setup.py index ba80a464060dc..615d3a0b4ff80 100755 --- a/setup.py +++ b/setup.py @@ -174,9 +174,9 @@ MAJOR = 0 MINOR = 8 MICRO = 0 -ISRELEASED = False +ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) -QUALIFIER = 'b2' +QUALIFIER = 'rc1' FULLVERSION = VERSION if not ISRELEASED: From 5d90f7d0e7e9bb53e55fdb7b70f972aa502004ed Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 22 Jun 2012 15:44:31 -0400 Subject: [PATCH 16/42] BUG: fix failure in grouped boxplot causing docs bug. push rc number to 2 --- pandas/tests/test_graphics.py | 3 +++ pandas/tools/plotting.py | 3 ++- setup.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 57bc17c5c33b6..2c386076b69b7 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -194,6 +194,9 @@ def test_boxplot(self): _check_plot_works(df.boxplot, notch=1) _check_plot_works(df.boxplot, by='indic', notch=1) + df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + _check_plot_works(df.boxplot, by='X') @slow def test_kde(self): diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 485d568f91353..97bea5cb7f7ec 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1007,7 +1007,8 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, def plot_group(grouped, ax): keys, values = zip(*grouped) keys = [_stringify(x) for x in keys] - 
ax.boxplot(remove_na(values), **kwds) + values = [remove_na(v) for v in values] + ax.boxplot(values, **kwds) if kwds.get('vert', 1): ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) else: diff --git a/setup.py b/setup.py index 615d3a0b4ff80..2c62f7051bfce 100755 --- a/setup.py +++ b/setup.py @@ -176,7 +176,7 @@ MICRO = 0 ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) -QUALIFIER = 'rc1' +QUALIFIER = 'rc2' FULLVERSION = VERSION if not ISRELEASED: From 896140435e9cfbd8e893668c92b2e5d9b9389276 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 20:47:55 -0400 Subject: [PATCH 17/42] ENH: freqstr with offset #1184 --- pandas/tseries/offsets.py | 54 ++++++++++++++++++++++++++-- pandas/tseries/tests/test_offsets.py | 7 ++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index b03d8febbfbc2..8f62c3304cb53 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -221,10 +221,11 @@ def freqstr(self): return repr(self) if self.n != 1: - return '%d%s' % (self.n, code) + fstr = '%d%s' % (self.n, code) else: - return code + fstr = code + return fstr class BusinessDay(CacheableOffset, DateOffset): """ @@ -261,6 +262,55 @@ def __repr__(self): out += '>' return out + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + if self.offset: + fstr += self._offset_str() + + return fstr + + def _offset_str(self): + + def get_str(td): + off_str = '' + if td.days > 0: + off_str += str(td.days) + 'D' + if td.seconds > 0: + s = td.seconds + hrs = int(s / 3600) + if hrs != 0: + off_str += str(hrs) + 'H' + s -= hrs * 3600 + mts = int(s / 60) + if mts != 0: + off_str += str(mts) + 'Min' + s -= mts * 60 + if s != 0: + off_str += str(s) + 's' + if td.microseconds > 0: + off_str += str(td.microseconds) + 'us' + return off_str + + if 
isinstance(self.offset, timedelta): + tot_sec = self.offset.total_seconds() + if tot_sec > 0: + off_str = '+' + get_str(self.offset) + if tot_sec < 0: + off_str = '-' + get_str(-self.offset) + return off_str + else: + return '+' + repr(self.offset) + def isAnchored(self): return (self.n == 1) diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 05f638afaebed..1311b9ca451a2 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1389,6 +1389,13 @@ def test_dateoffset_misc(): assert(not offsets.DateOffset(months=2) == 2) +def test_freq_offsets(): + off = BDay(1, offset=timedelta(0, 1800)) + assert(off.freqstr == 'B+30Min') + + off = BDay(1, offset=timedelta(0, -1800)) + assert(off.freqstr == 'B-30Min') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], From d90a8290ffbdb3e18b44a7b7db56b6e6408704ba Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 19:10:13 -0400 Subject: [PATCH 18/42] ENH: option to disable empty string conversion to NaN in file readers --- pandas/io/parsers.py | 4 ++-- pandas/io/tests/test_parsers.py | 26 ++++++++++++++++++++++++++ pandas/src/inference.pyx | 15 ++++++++++----- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8261b889b4500..3b2fedd65b45f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1029,9 +1029,9 @@ def _convert_types(values, na_values): return values, na_count try: - result = lib.maybe_convert_numeric(values, na_values) + result = lib.maybe_convert_numeric(values, na_values, False) except Exception: - na_count = lib.sanitize_objects(values, na_values) + na_count = lib.sanitize_objects(values, na_values, False) result = values if result.dtype == np.object_: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7a75e687a4a8a..1f14df528af8f 100644 --- 
a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -52,6 +52,32 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = read_csv(StringIO(data)) + xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_read_csv(self): pass diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 4057d240fceb6..451b998b0481b 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -241,7 +241,8 @@ def is_date_array(ndarray[object] values): return True -def maybe_convert_numeric(ndarray[object] values, set na_values): +def maybe_convert_numeric(ndarray[object] values, set na_values, + convert_empty=True): ''' Type inference function-- convert strings to numeric (potentially) and convert to proper dtype array @@ -275,8 +276,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = complexes[i] = nan seen_float = 1 elif len(val) == 0: - floats[i] = complexes[i] = nan - seen_float = 1 + if convert_empty: + floats[i] = complexes[i] = nan + seen_float = 1 + else: + raise ValueError('Empty string encountered') elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -573,7 +577,8 @@ def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, return result -def sanitize_objects(ndarray[object] values, set na_values): 
+def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): cdef: Py_ssize_t i, n object val, onan @@ -585,7 +590,7 @@ def sanitize_objects(ndarray[object] values, set na_values): for i from 0 <= i < n: val = values[i] - if val == '' or val in na_values: + if (convert_empty and val == '') or (val in na_values): values[i] = onan na_count += 1 elif val in memo: From bf84631de8b37f53162695a7272075d9234a6d39 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 16:47:54 -0400 Subject: [PATCH 19/42] BUG: HDFStore converting all to datetime64 #1520 --- pandas/io/pytables.py | 4 ++-- pandas/io/tests/test_pytables.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 52af5eff72336..4ec176556eaa8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -716,7 +716,7 @@ def _write_array(self, group, key, value): vlarr.append(value) elif value.dtype.type == np.datetime64: self.handle.createArray(group, key, value.view('i8')) - group._v_attrs.value_type = 'datetime64' + getattr(group, key)._v_attrs.value_type = 'datetime64' else: self.handle.createArray(group, key, value) @@ -958,7 +958,7 @@ def _read_array(group, key): if isinstance(node, tables.VLArray): return data[0] else: - dtype = getattr(group._v_attrs, 'value_type', None) + dtype = getattr(node._v_attrs, 'value_type', None) if dtype == 'datetime64': return np.array(data, dtype='M8[ns]') return data diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 8576b3d69ad99..cddd8e53fe55b 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -660,6 +660,18 @@ def test_unicode_index(self): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) + def test_store_datetime_mixed(self): + df = DataFrame({'a': [1,2,3], 'b': [1.,2.,3.], 'c': ['a', 'b', 'c']}) + ts = tm.makeTimeSeries() + 
df['d'] = ts.index[:3] + store = HDFStore('data.h5') + store['df'] = df + store.close() + store = HDFStore('data.h5') + test = store['df'] + assert_series_equal(df.dtypes, test.dtypes) + store.close() + os.remove('data.h5') def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) From e2091854057028403c3f004787643ce1ed3ec5a3 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 17:18:14 -0400 Subject: [PATCH 20/42] TST: use _check_roundtrip --- pandas/io/tests/test_pytables.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index cddd8e53fe55b..87a6329ef9368 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -664,14 +664,7 @@ def test_store_datetime_mixed(self): df = DataFrame({'a': [1,2,3], 'b': [1.,2.,3.], 'c': ['a', 'b', 'c']}) ts = tm.makeTimeSeries() df['d'] = ts.index[:3] - store = HDFStore('data.h5') - store['df'] = df - store.close() - store = HDFStore('data.h5') - test = store['df'] - assert_series_equal(df.dtypes, test.dtypes) - store.close() - os.remove('data.h5') + self._check_roundtrip(df, tm.assert_frame_equal) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) From 3027871161409f84ecac029231cb444dd6a42c69 Mon Sep 17 00:00:00 2001 From: Chang She Date: Fri, 22 Jun 2012 20:54:34 -0400 Subject: [PATCH 21/42] ENH: retain Series names when constructing DataFrame from list of Series #1494 --- pandas/core/frame.py | 19 +++++++++++++++++++ pandas/tests/test_frame.py | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d41c02ad4200..b51fab43c9e33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -389,6 +389,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=copy) elif isinstance(data, list): if len(data) > 0: + if index is None and isinstance(data[0], Series): + index = 
_get_names_from_index(data) + if isinstance(data[0], (list, tuple, dict, Series)): conv_data, columns = _to_sdict(data, columns) if isinstance(conv_data, dict): @@ -4758,6 +4761,22 @@ def _convert_object_array(content, columns, coerce_float=False): for c, vals in zip(columns, content)) return sdict, columns +def _get_names_from_index(data): + index = range(len(data)) + has_some_name = any([s.name is not None for s in data]) + if not has_some_name: + return index + + count = 0 + for i, s in enumerate(data): + n = s.name + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed %d' % count + count += 1 + + return index def _homogenize(data, index, columns, dtype=None): from pandas.core.series import _sanitize_array diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1906c3936292d..15d465e8df966 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1871,6 +1871,28 @@ def test_constructor_list_of_dicts(self): assert_frame_equal(result, expected) def test_constructor_list_of_series(self): + data = [{'a': 1.5, 'b': 3.0, 'c':4.0}, + {'a': 1.5, 'b': 3.0, 'c':6.0}] + sdict = dict(zip(['x', 'y'], data)) + idx = Index(['a', 'b', 'c']) + + # all named + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx, name='y')] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + # some unnamed + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + + sdict = dict(zip(['x', 'Unnamed 0'], data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result.sort_index(), expected) + + # none named data = [{'a': 1.5, 'b': 3, 'c':4, 'd':6}, {'a': 1.5, 'b': 3, 'd':6}, {'a': 1.5, 'd':6}, From c903094bd5ed3a6ce8509e001f5fc69b500ed220 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 10:13:59 -0400 Subject: [PATCH 22/42] DOC: release notes --- 
RELEASE.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index bfb2ca515372a..79b980694860a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -25,7 +25,7 @@ Where to get it pandas 0.8.0 ============ -**Release date:** 6/22/2012 +**Release date:** 6/26/2012 **New features** @@ -131,6 +131,7 @@ pandas 0.8.0 - Add ``retries`` argument to ``get_data_yahoo`` to try to prevent Yahoo! API 404s (#826) - Improve performance of reshaping by using O(N) categorical sorting + - Series names will be used for index of DataFrame if no index passed (#1494) **API Changes** From 66690f276619874061a10b0844f4df93292c2d18 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 00:02:03 -0400 Subject: [PATCH 23/42] BUG/TST: plot irregular and reg freq on same subplot --- pandas/tools/plotting.py | 13 +++++++++--- pandas/tseries/plotting.py | 30 ++++++++++++++++++--------- pandas/tseries/tests/test_plotting.py | 28 ++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 97bea5cb7f7ec..8cf584155d1c8 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -596,10 +596,13 @@ def __init__(self, data, **kwargs): @property def has_ts_index(self): + # TODO refactor this whole regular/irregular kludge from pandas.core.frame import DataFrame if isinstance(self.data, (Series, DataFrame)): + ax, _ = self._get_ax_and_style(0) freq = (getattr(self.data.index, 'freq', None) - or getattr(self.data.index, 'inferred_freq', None)) + or getattr(self.data.index, 'inferred_freq', None) + or getattr(ax, 'freq', None)) return (freq is not None) and self._has_dynamic_index_freq(freq) return False @@ -649,8 +652,12 @@ def _maybe_convert_index(self, data): freq = get_period_alias(freq) - if freq is None and hasattr(data.index, 'inferred_freq'): - freq = data.index.inferred_freq + if freq is None: + freq = getattr(data.index, 'inferred_freq', None) + + if 
freq is None: + ax, _ = self._get_ax_and_style(0) + freq = getattr(ax, 'freq', None) if isinstance(freq, DateOffset): freq = freq.rule_code diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 79823e8c12dca..7f4b4e5130c66 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -44,8 +44,17 @@ def tsplot(series, plotf, **kwargs): """ # Used inferred freq is possible, need a test case for inferred freq = getattr(series.index, 'freq', None) - if freq is None and hasattr(series.index, 'inferred_freq'): - freq = series.index.inferred_freq + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) + + if 'ax' in kwargs: + ax = kwargs.pop('ax') + else: + import matplotlib.pyplot as plt + ax = plt.gca() + + if freq is None: + freq = getattr(ax, 'freq', None) if isinstance(freq, DateOffset): freq = freq.rule_code @@ -62,12 +71,6 @@ def tsplot(series, plotf, **kwargs): style = kwargs.pop('style', None) - if 'ax' in kwargs: - ax = kwargs.pop('ax') - else: - import matplotlib.pyplot as plt - ax = plt.gca() - # Specialized ts plotting attributes for Axes ax.freq = freq xaxis = ax.get_xaxis() @@ -93,12 +96,19 @@ def tsplot(series, plotf, **kwargs): format_dateaxis(ax, ax.freq) - left = series.index[0] #get_datevalue(series.index[0], freq) - right = series.index[-1] #get_datevalue(series.index[-1], freq) + left, right = _get_xlim(ax.get_lines()) ax.set_xlim(left, right) return ax +def _get_xlim(lines): + left, right = np.inf, -np.inf + for l in lines: + x = l.get_xdata() + left = min(x[0].ordinal, left) + right = max(x[-1].ordinal, right) + return left, right + def get_datevalue(date, freq): if isinstance(date, Period): return date.asfreq(freq).ordinal diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 81f972debe6c2..9331d79efe15a 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -234,8 +234,10 @@ def test_finder_daily(self): @slow 
def test_finder_quarterly(self): + import matplotlib.pyplot as plt xp = Period('1988Q1').ordinal yrs = [3.5, 11] + plt.close('all') for n in yrs: rng = period_range('1987Q2', periods=int(n * 4), freq='Q') ser = Series(np.random.randn(len(rng)), rng) @@ -250,8 +252,10 @@ def test_finder_quarterly(self): @slow def test_finder_monthly(self): + import matplotlib.pyplot as plt xp = Period('1988-1').ordinal yrs = [1.15, 2.5, 4, 11] + plt.close('all') for n in yrs: rng = period_range('1987Q2', periods=int(n * 12), freq='M') ser = Series(np.random.randn(len(rng)), rng) @@ -263,8 +267,12 @@ def test_finder_monthly(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] self.assertEqual(xp, rs) + plt.close('all') - + @slow + def test_finder_monthly_long(self): + import matplotlib.pyplot as plt + plt.close('all') rng = period_range('1988Q1', periods=24*12, freq='M') ser = Series(np.random.randn(len(rng)), rng) ax = ser.plot() @@ -433,6 +441,24 @@ def test_secondary_frame(self): self.assert_(axes[1].get_yaxis().get_ticks_position() == 'default') self.assert_(axes[2].get_yaxis().get_ticks_position() == 'right') + @slow + def test_mixed_freq(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s1.plot() + ax2 = s2.plot(style='g') + lines = ax2.get_lines() + idx1 = lines[0].get_xdata() + idx2 = lines[1].get_xdata() + self.assert_(idx1.equals(s1.index.to_period('B'))) + self.assert_(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assert_(left == pidx[0].ordinal) + self.assert_(right == pidx[-1].ordinal) + PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt From 0940b78a0568020ca08ef02131f57e3b6285cf3d Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 14:36:31 -0400 Subject: [PATCH 24/42] BUG: plot mixed frequencies #1517 --- pandas/tools/plotting.py | 
44 ++++++++++------ pandas/tseries/plotting.py | 66 +++++++++++++++-------- pandas/tseries/tests/test_plotting.py | 75 ++++++++++++++++++++++++--- 3 files changed, 140 insertions(+), 45 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 8cf584155d1c8..30505cfc4d48c 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -485,13 +485,16 @@ def plt(self): _need_to_set_index = False - def _get_xticks(self): + def _get_xticks(self, convert_period=False): index = self.data.index is_datetype = index.inferred_type in ('datetime', 'date', 'datetime64') if self.use_index: - if index.is_numeric() or is_datetype: + if convert_period and isinstance(index, PeriodIndex): + index = index.to_timestamp() + x = index._mpl_repr() + elif index.is_numeric() or is_datetype: """ Matplotlib supports numeric values or datetime objects as xaxis values. Taking LBYL approach here, by the time @@ -594,19 +597,14 @@ class LinePlot(MPLPlot): def __init__(self, data, **kwargs): MPLPlot.__init__(self, data, **kwargs) - @property - def has_ts_index(self): - # TODO refactor this whole regular/irregular kludge + def _index_freq(self): from pandas.core.frame import DataFrame if isinstance(self.data, (Series, DataFrame)): - ax, _ = self._get_ax_and_style(0) freq = (getattr(self.data.index, 'freq', None) - or getattr(self.data.index, 'inferred_freq', None) - or getattr(ax, 'freq', None)) - return (freq is not None) and self._has_dynamic_index_freq(freq) - return False + or getattr(self.data.index, 'inferred_freq', None)) + return freq - def _has_dynamic_index_freq(self, freq): + def _is_dynamic_freq(self, freq): if isinstance(freq, DateOffset): freq = freq.rule_code else: @@ -614,13 +612,26 @@ def _has_dynamic_index_freq(self, freq): freq = get_period_alias(freq) return freq is not None + def _use_dynamic_x(self): + freq = self._index_freq() + + ax, _ = self._get_ax_and_style(0) + ax_freq = getattr(ax, 'freq', None) + if freq is None: # convert irregular 
if axes has freq info + freq = ax_freq + else: # do not use tsplot if irregular was plotted first + if (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + return (freq is not None) and self._is_dynamic_freq(freq) + def _make_plot(self): # this is slightly deceptive - if self.use_index and self.has_ts_index: + if self.use_index and self._use_dynamic_x(): data = self._maybe_convert_index(self.data) self._make_ts_plot(data) else: - x = self._get_xticks() + x = self._get_xticks(convert_period=True) plotf = self._get_plot_function() @@ -649,6 +660,8 @@ def _maybe_convert_index(self, data): if (isinstance(data.index, DatetimeIndex) and isinstance(data, DataFrame)): freq = getattr(data.index, 'freqstr', None) + if isinstance(freq, DateOffset): + freq = freq.rule_code freq = get_period_alias(freq) @@ -659,9 +672,6 @@ def _maybe_convert_index(self, data): ax, _ = self._get_ax_and_style(0) freq = getattr(ax, 'freq', None) - if isinstance(freq, DateOffset): - freq = freq.rule_code - data = DataFrame(data.values, index=data.index.to_period(freq=freq), columns=data.columns) @@ -699,7 +709,7 @@ def _post_plot_logic(self): else: self.axes[0].legend(loc='best') - condition = (not self.has_ts_index + condition = (not self._use_dynamic_x and df.index.is_all_dates and not self.subplots or (self.subplots and self.sharex)) diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 7f4b4e5130c66..2e0506e6c5fc2 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -43,32 +43,30 @@ def tsplot(series, plotf, **kwargs): """ # Used inferred freq is possible, need a test case for inferred - freq = getattr(series.index, 'freq', None) - if freq is None: - freq = getattr(series.index, 'inferred_freq', None) - if 'ax' in kwargs: ax = kwargs.pop('ax') else: import matplotlib.pyplot as plt ax = plt.gca() - if freq is None: - freq = getattr(ax, 'freq', None) - - if isinstance(freq, DateOffset): - freq = freq.rule_code + freq = _get_freq(ax, 
series) + # resample against axes freq if necessary + if freq is None: # pragma: no cover + raise ValueError('Cannot use dynamic axis without frequency info') else: - freq = frequencies.get_base_alias(freq) + ax_freq = getattr(ax, 'freq', None) + if (ax_freq is not None) and (freq != ax_freq): + if frequencies.is_subperiod(freq, ax_freq): # downsample + how = kwargs.pop('how', 'first') + series = series.resample(ax_freq, how=how) + elif frequencies.is_superperiod(freq, ax_freq): + series = series.resample(ax_freq) + freq = ax_freq - freq = frequencies.get_period_alias(freq) # Convert DatetimeIndex to PeriodIndex if isinstance(series.index, DatetimeIndex): series = series.to_period(freq=freq) - if freq != series.index.freq: - series = series.asfreq(freq) - style = kwargs.pop('style', None) # Specialized ts plotting attributes for Axes @@ -81,13 +79,7 @@ def tsplot(series, plotf, **kwargs): ax.date_axis_info = None # format args and lot - mask = isnull(series) - if mask.any(): - masked_array = np.ma.array(series.values) - masked_array = np.ma.masked_where(mask, masked_array) - args = [series.index, masked_array] - else: - args = [series.index, series] + args = _maybe_mask(series) if style is not None: args.append(style) @@ -101,6 +93,38 @@ def tsplot(series, plotf, **kwargs): return ax +def _maybe_mask(series): + mask = isnull(series) + if mask.any(): + masked_array = np.ma.array(series.values) + masked_array = np.ma.masked_where(mask, masked_array) + args = [series.index, masked_array] + else: + args = [series.index, series] + return args + +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, 'freq', None) + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) + + ax_freq = getattr(ax, 'freq', None) + + # use axes freq if no data freq + if freq is None: + freq = ax_freq + + # get the period frequency + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = frequencies.get_base_alias(freq) + + 
freq = frequencies.get_period_alias(freq) + + return freq + def _get_xlim(lines): left, right = np.inf, -np.inf for l in lines: diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 9331d79efe15a..e0a8acacf2df3 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -6,6 +6,7 @@ import numpy as np from numpy.testing.decorators import slow +from numpy.testing import assert_array_equal from pandas import Index, Series, DataFrame, isnull, notnull @@ -52,17 +53,22 @@ def setUp(self): def test_tsplot(self): from pandas.tseries.plotting import tsplot import matplotlib.pyplot as plt + plt.close('all') + ax = plt.gca() ts = tm.makeTimeSeries() plot_ax = tsplot(ts, plt.Axes.plot) self.assert_(plot_ax == ax) f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + plt.close('all') for s in self.period_ser: _check_plot_works(f, s.index.freq, ax=ax, series=s) + plt.close('all') for s in self.datetime_ser: _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) + plt.close('all') plt.close('all') ax = ts.plot(style='k') @@ -442,7 +448,7 @@ def test_secondary_frame(self): self.assert_(axes[2].get_yaxis().get_ticks_position() == 'right') @slow - def test_mixed_freq(self): + def test_mixed_freq_regular_first(self): import matplotlib.pyplot as plt plt.close('all') s1 = tm.makeTimeSeries() @@ -458,6 +464,58 @@ def test_mixed_freq(self): pidx = s1.index.to_period() self.assert_(left == pidx[0].ordinal) self.assert_(right == pidx[-1].ordinal) + plt.close('all') + + @slow + def test_mixed_freq_irregular_first(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s2.plot(style='g') + ax = s1.plot() + self.assert_(not hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, 
s1.index.asobject.values) + plt.close('all') + + @slow + def test_mixed_freq_hf_first(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'D') + + @slow + def test_mixed_freq_lf_first(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + + @slow + def test_mixed_freq_irreg_period(self): + ts = tm.makeTimeSeries() + irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] + rng = period_range('1/3/2000', periods=30, freq='B') + ps = Series(np.random.randn(len(rng)), rng) + irreg.plot() + ps.plot() PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): @@ -466,19 +524,22 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): fig = plt.gcf() plt.clf() ax = fig.add_subplot(211) + orig_ax = kwargs.pop('ax', plt.gca()) + orig_axfreq = getattr(orig_ax, 'freq', None) + ret = f(*args, **kwargs) assert(ret is not None) # do something more intelligent - orig_ax = kwargs.pop('ax', plt.gca()) - if series is not None: # non-business + ax = kwargs.pop('ax', plt.gca()) + if series is not None: dfreq = series.index.freq if isinstance(dfreq, DateOffset): dfreq = dfreq.rule_code - #dfreq = frequencies.offset_to_period_alias(dfreq) - assert(orig_ax.freq == dfreq) + if orig_axfreq is None: + assert(ax.freq == dfreq) - if freq is not None: - assert(orig_ax.freq == freq) + if freq is not None and orig_axfreq is 
None: + assert(ax.freq == freq) ax = fig.add_subplot(212) try: From 07e4ecd5c99ba799eebc3e31338e81780eb616b7 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 15:10:21 -0400 Subject: [PATCH 25/42] ENH: handle weekly resampling via daily --- pandas/tseries/plotting.py | 6 +++++- pandas/tseries/tests/test_plotting.py | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 2e0506e6c5fc2..3e3540271153a 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -57,10 +57,14 @@ def tsplot(series, plotf, **kwargs): ax_freq = getattr(ax, 'freq', None) if (ax_freq is not None) and (freq != ax_freq): if frequencies.is_subperiod(freq, ax_freq): # downsample - how = kwargs.pop('how', 'first') + how = kwargs.pop('how', 'last') series = series.resample(ax_freq, how=how) elif frequencies.is_superperiod(freq, ax_freq): series = series.resample(ax_freq) + else: # one freq is weekly + how = kwargs.pop('how', 'last') + series = series.resample('D', how=how, fill_method='pad') + series = series.resample(ax_freq, how=how, fill_method='pad') freq = ax_freq # Convert DatetimeIndex to PeriodIndex diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index e0a8acacf2df3..718249330ec92 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -517,6 +517,32 @@ def test_mixed_freq_irreg_period(self): irreg.plot() ps.plot() + @slow + def test_to_weekly_resampling(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq.startswith('W')) + + @slow + def test_from_weekly_resampling(self): + 
import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt From 791e7042b43b7a4246b9736d5c8cd1d7f5a4df1b Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 19:39:08 -0400 Subject: [PATCH 26/42] ENH: column aliases for to_csv/to_excel #921 --- pandas/core/frame.py | 31 ++++++++++++++++++++++--------- pandas/tests/test_frame.py | 19 +++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b51fab43c9e33..c8cbed8fee44f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1024,7 +1024,8 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, + def _helper_csvexcel(self, writer, na_rep=None, cols=None, + col_aliases=None, header=True, index=True, index_label=None): if cols is None: cols = self.columns @@ -1053,7 +1054,15 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, index_label = [index_label] encoded_labels = list(index_label) - encoded_cols = list(cols) + if col_aliases is not None: + if len(col_aliases) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(col_aliases)))) + else: + write_cols = col_aliases + else: + write_cols = cols + encoded_cols = list(write_cols) writer.writerow(encoded_labels + encoded_cols) else: @@ -1078,8 +1087,8 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - 
header=True, index=True, index_label=None, mode='w', - nanRep=None, encoding=None): + col_aliases=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1091,6 +1100,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write + col_aliases : sequence, default None + Optional column aliases to be written instead of column names header : boolean, default True Write out column names index : boolean, default True @@ -1126,14 +1137,16 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, else: csvout = csv.writer(f, lineterminator='\n', delimiter=sep) self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, - header=header, index=index, - index_label=index_label) + col_aliases=col_aliases, header=header, + index=index, index_label=index_label) + finally: if close: f.close() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', - cols=None, header=True, index=True, index_label=None): + cols=None, col_aliases=None, header=True, index=True, + index_label=None): """ Write DataFrame to a excel sheet @@ -1173,8 +1186,8 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', need_save = True excel_writer.cur_sheet = sheet_name self._helper_csvexcel(excel_writer, na_rep=na_rep, cols=cols, - header=header, index=index, - index_label=index_label) + col_aliases=col_aliases, header=header, + index=index, index_label=index_label) if need_save: excel_writer.save() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 15d465e8df966..273e5bbd75ab7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3100,6 +3100,16 @@ def test_to_csv_from_csv(self): result = DataFrame.from_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + 
self.frame2.to_csv(path, header=True, col_aliases=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + os.remove(path) def test_to_csv_multiindex(self): @@ -3284,6 +3294,15 @@ def test_to_excel_from_excel(self): np.testing.assert_equal('test1', reader.sheet_names[0]) np.testing.assert_equal('test2', reader.sheet_names[1]) + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=True, + col_aliases=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1') + xp = self.frame2.copy() + xp.columns = col_aliases + assert_frame_equal(xp, rs) os.remove(path) From 7923fa5466320551269cbfb4cd4492c135a553cd Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 25 Jun 2012 10:07:12 -0400 Subject: [PATCH 27/42] overload header keyword instead of extra col_aliases keyword --- pandas/core/frame.py | 40 +++++++++++++++++++------------------- pandas/tests/test_frame.py | 10 ++++++---- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8cbed8fee44f..a517df708fdd4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1025,15 +1025,16 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) def _helper_csvexcel(self, writer, na_rep=None, cols=None, - col_aliases=None, header=True, - index=True, index_label=None): + header=True, index=True, index_label=None): if cols is None: cols = self.columns series = {} for k, v in self._series.iteritems(): series[k] = v.values - if header: + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: if index: # should write something for index label if index_label is None: @@ -1054,12 +1055,12 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, index_label = [index_label] encoded_labels = list(index_label) - if col_aliases is not None: - if len(col_aliases) != len(cols): + if has_aliases: + 
if len(header) != len(cols): raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(col_aliases)))) + % (len(cols), len(header)))) else: - write_cols = col_aliases + write_cols = header else: write_cols = cols encoded_cols = list(write_cols) @@ -1087,7 +1088,7 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - col_aliases=None, header=True, index=True, index_label=None, + header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1100,10 +1101,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write - col_aliases : sequence, default None - Optional column aliases to be written instead of column names - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. 
If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1137,16 +1137,15 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, else: csvout = csv.writer(f, lineterminator='\n', delimiter=sep) self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, - col_aliases=col_aliases, header=header, - index=index, index_label=index_label) + header=header, index=index, + index_label=index_label) finally: if close: f.close() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', - cols=None, col_aliases=None, header=True, index=True, - index_label=None): + cols=None, header=True, index=True, index_label=None): """ Write DataFrame to a excel sheet @@ -1160,8 +1159,9 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', Missing data rep'n cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. 
If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1186,8 +1186,8 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', need_save = True excel_writer.cur_sheet = sheet_name self._helper_csvexcel(excel_writer, na_rep=na_rep, cols=cols, - col_aliases=col_aliases, header=header, - index=index, index_label=index_label) + header=header, index=index, + index_label=index_label) if need_save: excel_writer.save() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 273e5bbd75ab7..8c1c2169ed0d7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3103,13 +3103,16 @@ def test_to_csv_from_csv(self): # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_csv(path, header=True, col_aliases=col_aliases) + self.frame2.to_csv(path, header=col_aliases) rs = DataFrame.from_csv(path) xp = self.frame2.copy() xp.columns = col_aliases assert_frame_equal(xp, rs) + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + os.remove(path) def test_to_csv_multiindex(self): @@ -3296,10 +3299,9 @@ def test_to_excel_from_excel(self): # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(path, 'test1', header=True, - col_aliases=col_aliases) + self.frame2.to_excel(path, 'test1', header=col_aliases) reader = ExcelFile(path) - rs = reader.parse('test1') + rs = reader.parse('test1', index_col=0) xp = self.frame2.copy() xp.columns = col_aliases assert_frame_equal(xp, rs) From 0d91eda8edd6a7ce73c2414eee0137be107c8a38 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 12:23:04 -0400 Subject: [PATCH 28/42] DOC: release notes re: #921 --- RELEASE.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 79b980694860a..af0d06caa1825 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -132,6 +132,8 @@ 
pandas 0.8.0 404s (#826) - Improve performance of reshaping by using O(N) categorical sorting - Series names will be used for index of DataFrame if no index passed (#1494) + - Header argument in DataFrame.to_csv can accept a list of column names to + use instead of the object's columns (#921) **API Changes** From 0b30cacaa643e0bc911d676d96924292fd879692 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 25 Jun 2012 09:59:52 -0400 Subject: [PATCH 29/42] ENH: Add raise on conflict keyword to update --- pandas/core/frame.py | 15 ++++++++++++++- pandas/tests/test_frame.py | 11 +++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a517df708fdd4..9a3c47302707f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -166,6 +166,9 @@ merged : DataFrame """ +# Custom error class for update + +class DataConflictError(Exception): pass #---------------------------------------------------------------------- # Factory helper methods @@ -3168,7 +3171,8 @@ def combine_first(self, other): combiner = lambda x, y: np.where(isnull(x), y, x) return self.combine(other, combiner) - def update(self, other, join='left', overwrite=True, filter_func=None): + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): """ Modify DataFrame in place using non-NA values from passed DataFrame. Aligns on indices @@ -3182,6 +3186,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None): filter_func : callable(1d-array) -> 1d-array, default None Can choose to replace values other than NA. Return True for values that should be updated + raise_conflict : bool + If True, will raise an error if the DataFrame and other both + contain data in the same place. 
""" if join != 'left': raise NotImplementedError @@ -3193,6 +3200,12 @@ def update(self, other, join='left', overwrite=True, filter_func=None): if filter_func is not None: mask = -filter_func(this) | isnull(that) else: + if raise_conflict: + mask_this = notnull(that) + mask_that = notnull(this) + if any(mask_this & mask_that): + raise DataConflictError("Data overlaps.") + if overwrite: mask = isnull(that) else: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8c1c2169ed0d7..6ccd13911d24f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5347,6 +5347,17 @@ def test_update_filtered(self): [1.5, nan, 7.]]) assert_frame_equal(df, expected) + def test_update_raise(self): + df = DataFrame([[1.5, 1, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[2., nan], + [nan, 7]], index=[1, 3], columns=[1,2]) + + np.testing.assert_raises(Exception, df.update, *(other,), + **{'raise_conflict' : True}) def test_combineAdd(self): # trivial From 7a233cd402590a42b3f8ede7c679f8d06fb9b6ef Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 12:35:53 -0400 Subject: [PATCH 30/42] DOC: release notes --- RELEASE.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.rst b/RELEASE.rst index af0d06caa1825..d9e5f6abe4119 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -134,6 +134,7 @@ pandas 0.8.0 - Series names will be used for index of DataFrame if no index passed (#1494) - Header argument in DataFrame.to_csv can accept a list of column names to use instead of the object's columns (#921) + - Add ``raise_conflict`` argument to DataFrame.update (#1526) **API Changes** From 10284c15efa42aebbf9b2972e4e2f6c75a807126 Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 25 Jun 2012 14:00:07 -0400 Subject: [PATCH 31/42] BUG: timedelta.total_seconds only in 2.7 and 3.2 --- pandas/tseries/offsets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tseries/offsets.py 
b/pandas/tseries/offsets.py index 8f62c3304cb53..f35a85970136a 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -280,7 +280,6 @@ def freqstr(self): return fstr def _offset_str(self): - def get_str(td): off_str = '' if td.days > 0: @@ -302,10 +301,10 @@ def get_str(td): return off_str if isinstance(self.offset, timedelta): - tot_sec = self.offset.total_seconds() - if tot_sec > 0: + zero = timedelta(0, 0, 0) + if self.offset >= zero: off_str = '+' + get_str(self.offset) - if tot_sec < 0: + else: off_str = '-' + get_str(-self.offset) return off_str else: From b225898728b708c22ee068ba499b052b692b3b7c Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 25 Jun 2012 14:26:43 -0400 Subject: [PATCH 32/42] BUG: DataFrame plotting with inferred freq --- pandas/tools/plotting.py | 10 +++++++--- pandas/tseries/tests/test_plotting.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 30505cfc4d48c..985d58895d8f6 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -660,18 +660,22 @@ def _maybe_convert_index(self, data): if (isinstance(data.index, DatetimeIndex) and isinstance(data, DataFrame)): freq = getattr(data.index, 'freqstr', None) + + if freq is None: + freq = getattr(data.index, 'inferred_freq', None) + if isinstance(freq, DateOffset): freq = freq.rule_code freq = get_period_alias(freq) - if freq is None: - freq = getattr(data.index, 'inferred_freq', None) - if freq is None: ax, _ = self._get_ax_and_style(0) freq = getattr(ax, 'freq', None) + if freq is None: + raise ValueError('Could not get frequency alias for plotting') + data = DataFrame(data.values, index=data.index.to_period(freq=freq), columns=data.columns) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 718249330ec92..c88102a31c8f4 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ 
-49,6 +49,22 @@ def setUp(self): columns=['A', 'B', 'C']) for x in idx] + @slow + def test_frame_inferred(self): + # inferred freq + import matplotlib.pyplot as plt + plt.close('all') + idx = date_range('1/1/1987', freq='MS', periods=100) + idx = DatetimeIndex(idx.values, freq=None) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + df.plot() + + # axes freq + idx = idx[0:40] + idx[45:99] + df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) + df2.plot() + plt.close('all') + @slow def test_tsplot(self): from pandas.tseries.plotting import tsplot From 483348be900785905e3f6be195dfdce7c0571a44 Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 25 Jun 2012 14:49:19 -0400 Subject: [PATCH 33/42] BUG: plotting DataFrame with freq with offset --- pandas/tools/plotting.py | 2 +- pandas/tseries/tests/test_plotting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 985d58895d8f6..37b3f6e641202 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -659,7 +659,7 @@ def _maybe_convert_index(self, data): from pandas.core.frame import DataFrame if (isinstance(data.index, DatetimeIndex) and isinstance(data, DataFrame)): - freq = getattr(data.index, 'freqstr', None) + freq = getattr(data.index, 'freq', None) if freq is None: freq = getattr(data.index, 'inferred_freq', None) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index c88102a31c8f4..6e246fb8ed0ec 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -42,7 +42,7 @@ def setUp(self): columns=['A', 'B', 'C']) for x in idx] - freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A'] + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] self.datetime_df = [DataFrame(np.random.randn(len(x), 3), 
index=x, From 6832a94e2c87267a0ae5d19e4ceddd5950a650b6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 17:44:34 -0400 Subject: [PATCH 34/42] ENH: experimental lreshape function --- pandas/core/api.py | 3 +- pandas/core/common.py | 12 +++++ pandas/core/reshape.py | 51 +++++++++++++++++++++ pandas/tests/test_reshape.py | 88 +++++++++++++++++++++++++++++++++++- 4 files changed, 151 insertions(+), 3 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index df164575e3b04..8cf3b7f4cbda4 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -15,7 +15,8 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel from pandas.core.groupby import groupby -from pandas.core.reshape import pivot_simple as pivot, get_dummies +from pandas.core.reshape import (pivot_simple as pivot, get_dummies, + lreshape) WidePanel = Panel diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d94e55c6d597..4db15c6c69101 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -914,3 +914,15 @@ def writerow(self, row): self.stream.write(data) # empty queue self.queue.truncate(0) + + +_NS_DTYPE = np.dtype('M8[ns]') + +def _concat_compat(to_concat): + if all(x.dtype == _NS_DTYPE for x in to_concat): + # work around NumPy 1.6 bug + new_values = np.concatenate([x.view(np.int64) for x in to_concat]) + return new_values.view(_NS_DTYPE) + else: + return np.concatenate(to_concat) + diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 08d45ddf63bd5..8703de4b31c0d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -12,6 +12,7 @@ from pandas.core.common import notnull, _ensure_platform_int from pandas.core.groupby import (get_group_index, _compress_group_index, decons_group_index) +import pandas.core.common as com import pandas.lib as lib @@ -559,6 +560,56 @@ def melt(frame, id_vars=None, value_vars=None): mdata['variable'] = np.asarray(frame.columns).repeat(N) return DataFrame(mdata, 
columns=mcolumns) + +def lreshape(data, groups, dropna=True, label=None): + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : + groups : dict + na_action : + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = groups.keys() + values = groups.values() + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.diff(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError('All column lists must be same length') + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + mdata[target] = com._concat_compat([data[col].values for col in names]) + pivot_cols.append(target) + + for col in id_cols: + mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notnull(mdata[c]) + if not mask.all(): + mdata = dict((k, v[mask]) for k, v in mdata.iteritems()) + + return DataFrame(mdata, columns=id_cols + pivot_cols) + + def convert_dummies(data, cat_variables, prefix_sep='_'): """ Compute DataFrame with specified columns converted to dummy variables (0 / diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 036ee8677b41c..e37fd13b5758b 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -1,8 +1,20 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +from StringIO import StringIO +import cPickle as pickle +import operator +import os +import unittest + +import nose + from pandas import DataFrame +from numpy import nan import numpy as np -from pandas.core.reshape import melt, convert_dummies +from pandas.core.reshape import melt, convert_dummies, lreshape import pandas.util.testing as tm def test_melt(): @@ -38,8 +50,80 @@ def test_convert_dummies(): 
tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected2) +class Test_lreshape(unittest.TestCase): + + def test_pairs(self): + data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009'], + 'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], + 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt1': [1823, 3338, 1549, 3298, 4306], + 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], + 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + + df = DataFrame(data) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + result = lreshape(df, spec) + + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', '08jan2009', + '30dec2008', '21dec2008', '11jan2009', + '08jan2009', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, + 1454, 3139, 4133, 1766, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, + 103, 104, 105, 101, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Male', + 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', + '20jan2009', '21jan2009', '22jan2009', '31dec2008', + '03feb2009', '05feb2009', '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + 
'21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105, + 101, 102, 103, 104, 105, + 101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', + '21jan2009', nan, '22jan2009', + '31dec2008', '03feb2009', + '05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, + 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + self.assertRaises(ValueError, lreshape, df, spec) + + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) From 0ade7fa670981b8e11239c1b2c39dd44c7cf883d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 18:05:16 -0400 Subject: [PATCH 35/42] DOC: lreshape docstring, release note --- RELEASE.rst | 1 + pandas/core/reshape.py | 20 ++++++++++++++++++-- pandas/tseries/index.py | 12 ++---------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index d9e5f6abe4119..c5bb354c8bdc9 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -89,6 +89,7 @@ pandas 0.8.0 - Add support for Categorical use in GroupBy (#292) - Add ``any`` and ``all`` methods to DataFrame (#1416) - Add ``secondary_y`` option to Series.plot + - Add experimental ``lreshape`` function for reshaping wide to long **Improvements to existing features** diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 8703de4b31c0d..475a6822edfea 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ 
-567,9 +567,25 @@ def lreshape(data, groups, dropna=True, label=None): Parameters ---------- - data : + data : DataFrame groups : dict - na_action : + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], + 'hr': ['hr1', 'hr2']}) + team hr year + 0 Red Sox 514 2007 + 1 Yankees 573 2007 + 2 Red Sox 545 2008 + 3 Yankees 526 2008 Returns ------- diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 1c4af41ff90e3..ae212902fb494 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -484,7 +484,7 @@ def append(self, other): to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] - return Index(_concat(to_concat), name=name) + return Index(com._concat_compat(to_concat), name=name) def get_duplicates(self): values = Index.get_duplicates(self) @@ -775,7 +775,7 @@ def _fast_union(self, other): if left_end < right_end: loc = right.searchsorted(left_end, side='right') right_chunk = right.values[loc:] - dates = _concat((left.values, right_chunk)) + dates = com._concat_compat((left.values, right_chunk)) return self._view_like(dates) else: return left @@ -1371,11 +1371,3 @@ def _time_to_micros(time): return 1000000 * seconds + time.microsecond -def _concat(to_concat): - if all(x.dtype == _NS_DTYPE for x in to_concat): - # work around NumPy 1.6 bug - new_values = np.concatenate([x.view('i8') for x in to_concat]) - return new_values.view(_NS_DTYPE) - else: - return np.concatenate(to_concat) - From 3453d446d6e1755c807b6b293f020bb2ca5da81c Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 16:34:19 -0400 Subject: [PATCH 36/42] BUG: workaround vstack/concat bug in numpy 1.6 #1518 --- pandas/core/internals.py | 10 +++++++++- pandas/tests/test_frame.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff 
--git a/pandas/core/internals.py b/pandas/core/internals.py index 83278e3c994d0..1966b51f5b2e7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1389,7 +1389,7 @@ def _consolidate(blocks, items): def _merge_blocks(blocks, items): if len(blocks) == 1: return blocks[0] - new_values = np.vstack([b.values for b in blocks]) + new_values = _vstack([b.values for b in blocks]) new_items = blocks[0].items.append([b.items for b in blocks[1:]]) new_block = make_block(new_values, new_items, items, do_integrity_check=True) @@ -1422,3 +1422,11 @@ def _union_items_slow(all_items): else: seen = seen.union(items) return seen + +def _vstack(to_stack): + if all(x.dtype == _NS_DTYPE for x in to_stack): + # work around NumPy 1.6 bug + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(_NS_DTYPE) + else: + return np.vstack(to_stack) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6ccd13911d24f..e3794222d0481 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -13,6 +13,7 @@ from numpy.random import randn import numpy as np import numpy.ma as ma +from numpy.testing import assert_array_equal import pandas as pan import pandas.core.nanops as nanops @@ -21,7 +22,7 @@ import pandas.core.datetools as datetools from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, MultiIndex, DatetimeIndex) -from pandas.io.parsers import (ExcelFile, ExcelWriter) +from pandas.io.parsers import (ExcelFile, ExcelWriter, read_csv) from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -6393,6 +6394,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) + def test_consolidate_datetime64(self): + # numpy vstack bug + + data = """\ +starting,ending,measure +2012-06-21 00:00,2012-06-23 07:00,77 +2012-06-23 07:00,2012-06-23 16:30,65 +2012-06-23 
16:30,2012-06-25 08:00,77 +2012-06-25 08:00,2012-06-26 12:00,0 +2012-06-26 12:00,2012-06-27 08:00,77 +""" + df = read_csv(StringIO(data), parse_dates=[0,1]) + + ser_starting = df.starting + ser_starting.index = ser_starting.values + ser_starting = ser_starting.tz_localize('US/Eastern') + ser_starting = ser_starting.tz_convert('UTC') + + ser_ending = df.ending + ser_ending.index = ser_ending.values + ser_ending = ser_ending.tz_localize('US/Eastern') + ser_ending = ser_ending.tz_convert('UTC') + + df.starting = ser_starting.index + df.ending = ser_ending.index + + assert_array_equal(df.starting.values, ser_starting.index.values) + assert_array_equal(df.ending.values, ser_ending.index.values) + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: From eb7a751fa3396d1e194f319128be7dd4513512b7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 18:32:55 -0400 Subject: [PATCH 37/42] BUG: repr of pre-1900 datetime64 values in a DataFrame column close #1518 --- pandas/core/format.py | 14 +------------- pandas/src/datetime.pyx | 24 +++++++++++++++--------- pandas/tests/test_reshape.py | 1 - pandas/tseries/tests/test_timeseries.py | 6 ++++++ 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 59ab7b473d99a..776a0fd80291f 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -594,19 +594,7 @@ def _format_datetime64(x, tz=None): return 'NaT' stamp = lib.Timestamp(x, tz=tz) - base = stamp.strftime('%Y-%m-%d %H:%M:%S') - - fraction = stamp.microsecond * 1000 + stamp.nanosecond - digits = 9 - - if fraction == 0: - return base - - while (fraction % 10) == 0: - fraction /= 10 - digits -= 1 - - return base + ('.%%.%id' % digits) % fraction + return stamp._repr_base def _make_fixed_width(strings, justify='right'): diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx index 17f35a73501aa..f7e325c4ae718 100644 --- 
a/pandas/src/datetime.pyx +++ b/pandas/src/datetime.pyx @@ -102,15 +102,7 @@ class Timestamp(_Timestamp): return ts_base def __repr__(self): - result = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (self.year, self.month, - self.day, self.hour, - self.minute, self.second) - - if self.nanosecond != 0: - nanos = self.nanosecond + 1000 * self.microsecond - result += '.%.9d' % nanos - elif self.microsecond != 0: - result += '.%.6d' % self.microsecond + result = self._repr_base try: result += self.strftime('%z') @@ -124,6 +116,20 @@ class Timestamp(_Timestamp): return '' % result + @property + def _repr_base(self): + result = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (self.year, self.month, + self.day, self.hour, + self.minute, self.second) + + if self.nanosecond != 0: + nanos = self.nanosecond + 1000 * self.microsecond + result += '.%.9d' % nanos + elif self.microsecond != 0: + result += '.%.6d' % self.microsecond + + return result + @property def tz(self): """ diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index e37fd13b5758b..3cb600ff69513 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -126,4 +126,3 @@ def test_pairs(self): if __name__ == '__main__': nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) - diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f04f0a5b8259c..5f6a00a27699f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -384,6 +384,12 @@ def test_frame_add_datetime64_column(self): df['A'] = rng self.assert_(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! 
+ repr(df) + def test_frame_add_datetime64_col_other_units(self): n = 100 From 462c731ece74ad0886188ac1f2c100baf4a15dc9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 25 Jun 2012 18:34:47 -0400 Subject: [PATCH 38/42] DOC: small doc for #1450 --- doc/source/io.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 50a90f601c573..8c67b7d96d395 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -83,8 +83,10 @@ data into a DataFrame object. They can take a number of arguments: as the index. - ``names``: List of column names to use. If passed, header will be implicitly set to None. - - ``na_values``: optional list of strings to recognize as NaN (missing values), - in addition to a default set. + - ``na_values``: optional list of strings to recognize as NaN (missing + values), in addition to a default set. If you pass an empty list or an + empty list for a particular column, no values (including empty strings) + will be considered NA - ``parse_dates``: if True then index will be parsed as dates (False by default). 
You can specify more complicated options to parse a subset of columns or a combination of columns into a single date column From 4c860de3486e7410321c3403f655689aa73c4cc5 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 26 Jun 2012 11:52:07 -0400 Subject: [PATCH 39/42] ENH: handle datetime.date in Period constructor --- pandas/tseries/period.py | 7 +++++-- pandas/tseries/tests/test_period.py | 10 +++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 22e571acef416..0a8c54e1c257f 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,5 @@ # pylint: disable=E1101,E1103,W0232 - -from datetime import datetime +from datetime import datetime, date import numpy as np from pandas.tseries.frequencies import (get_freq_code as _gfc, to_offset, @@ -104,6 +103,10 @@ def __init__(self, value=None, freq=None, ordinal=None, dt = value if freq is None: raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') else: msg = "Value must be Period, string, integer, or datetime" raise ValueError(msg) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c65615cd1d03e..fb09fec8bfd1d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -7,7 +7,7 @@ """ from unittest import TestCase -from datetime import datetime, timedelta +from datetime import datetime, date, timedelta import unittest from numpy.ma.testutils import assert_equal @@ -168,6 +168,14 @@ def test_period_constructor(self): i1 = Period(ordinal=200701, freq='M') self.assertEqual(i1.year, 18695) + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), 
freq='M') + self.assertEqual(i1, i2) + self.assertRaises(ValueError, Period, ordinal=200701) def test_freq_str(self): From 14759715101b0d38dedebd2588c587f6f1c11aee Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 26 Jun 2012 11:57:34 -0400 Subject: [PATCH 40/42] ENH: register converters with matplotlib for better datetime convesion --- pandas/tools/plotting.py | 53 +++++++++++++++++---------- pandas/tseries/plotting.py | 23 +++++++++--- pandas/tseries/tests/test_plotting.py | 21 ++++++++++- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 37b3f6e641202..1f116529def2d 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1,6 +1,7 @@ # being a bit too dynamic # pylint: disable=E1101 from itertools import izip +import datetime import numpy as np @@ -13,6 +14,7 @@ from pandas.tseries.frequencies import get_period_alias, get_base_alias from pandas.tseries.offsets import DateOffset import pandas.tseries.tools as datetools +import pandas.lib as lib def _get_standard_kind(kind): return {'density' : 'kde'}.get(kind, kind) @@ -573,24 +575,39 @@ def _post_plot_logic(self): if self.subplots and self.legend: self.axes[0].legend(loc='best') -class DatetimeConverter(object): - - @classmethod - def convert(cls, values, units, axis): - def try_parse(values): - try: - return datetools.to_datetime(values).toordinal() - except Exception: +try: + import matplotlib.units as units + import matplotlib.dates as dates + + class DatetimeConverter(dates.DateConverter): + + @staticmethod + def convert(values, unit, axis): + def try_parse(values): + try: + return datetools.to_datetime(values).toordinal() + except Exception: + return values + + if isinstance(values, (datetime.datetime, datetime.date)): + return values.toordinal() + elif isinstance(values, (datetime.time)): + return dates.date2num(values) + elif (com.is_integer(values) or com.is_float(values)): return values - - if 
(com.is_integer(values) or - com.is_float(values)): + elif isinstance(values, str): + return try_parse(values) + elif isinstance(values, Index): + return values.map(try_parse) + elif isinstance(values, (list, tuple, np.ndarray)): + return [try_parse(x) for x in values] return values - elif isinstance(values, str): - return try_parse(values) - elif isinstance(values, Index): - return values.map(try_parse) - return map(try_parse, values) + + units.registry[lib.Timestamp] = DatetimeConverter() + units.registry[datetime.date] = DatetimeConverter() + units.registry[datetime.datetime] = DatetimeConverter() +except ImportError: + pass class LinePlot(MPLPlot): @@ -648,10 +665,6 @@ def _make_plot(self): y = np.ma.masked_where(mask, y) plotf(ax, x, y, style, label=label, **self.kwds) ax.grid(self.grid) - idx = getattr(self.data, 'index', None) - if isinstance(idx, DatetimeIndex) or (idx is not None and - idx.inferred_type == 'datetime'): - ax.get_xaxis().converter = DatetimeConverter def _maybe_convert_index(self, data): # tsplot converts automatically, but don't want to convert index diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 3e3540271153a..152f677a5b03e 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -77,7 +77,6 @@ def tsplot(series, plotf, **kwargs): ax.freq = freq xaxis = ax.get_xaxis() xaxis.freq = freq - xaxis.converter = DateConverter ax.legendlabels = [kwargs.get('label', None)] ax.view_interval = None ax.date_axis_info = None @@ -739,12 +738,24 @@ def format_dateaxis(subplot, freq): subplot.xaxis.set_minor_formatter(minformatter) pylab.draw_if_interactive() -class DateConverter(object): - @classmethod - def convert(cls, values, units, axis): - if isinstance(values, (int, float, str, datetime, Period)): +import matplotlib.units as units +import matplotlib.dates as dates + +class PeriodConverter(dates.DateConverter): + + @staticmethod + def convert(values, units, axis): + if not hasattr(axis, 'freq'): + raise 
TypeError('Axis must have `freq` set to convert to Periods') + valid_types = (str, datetime, Period, pydt.date, pydt.time) + if (isinstance(values, valid_types) or com.is_integer(values) or + com.is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - return map(lambda x: get_datevalue(x, axis.freq), values) + if isinstance(values, (list, tuple, np.ndarray)): + return [get_datevalue(x, axis.freq) for x in values] + return values + +units.registry[Period] = PeriodConverter() diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 6e246fb8ed0ec..58ffd2c8c4e38 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -1,5 +1,5 @@ import os -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date, time import unittest import nose @@ -559,6 +559,25 @@ def test_from_weekly_resampling(self): for l in ax.get_lines(): self.assert_(l.get_xdata().freq == 'M') + @slow + def test_irreg_dtypes(self): + #date + idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] + df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) + _check_plot_works(df.plot) + + #np.datetime64 + idx = date_range('1/1/2000', periods=10) + idx = idx[[0, 2, 5, 9]].asobject + df = DataFrame(np.random.randn(len(idx), 3), idx) + _check_plot_works(df.plot) + + #time + inc = Series(np.random.randint(1, 6, 9)).cumsum().values + idx = [time(1, 1, i) for i in inc] + df = DataFrame(np.random.randn(len(idx), 3), idx) + _check_plot_works(df.plot) + PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt From f89ced44269746125a06482d8947bde12fa373bc Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 26 Jun 2012 15:23:41 -0400 Subject: [PATCH 41/42] BUG: incorrect tick label positions #1531 (zooming is still wrong) --- 
pandas/tools/plotting.py | 8 ++++++-- pandas/tseries/tests/test_plotting.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 1f116529def2d..e5ba885c8e28e 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -461,9 +461,12 @@ def _adorn_subplots(self): self.ax.set_title(self.title) if self._need_to_set_index: - xticklabels = [_stringify(key) for key in self.data.index] + labels = [_stringify(key) for key in self.data.index] + labels = dict(zip(range(len(self.data.index)), labels)) + for ax_ in self.axes: # ax_.set_xticks(self.xticks) + xticklabels = [labels.get(x, '') for x in ax_.get_xticks()] ax_.set_xticklabels(xticklabels, rotation=self.rot) @property @@ -575,7 +578,7 @@ def _post_plot_logic(self): if self.subplots and self.legend: self.axes[0].legend(loc='best') -try: +try: # matplotlib is optional dependency import matplotlib.units as units import matplotlib.dates as dates @@ -663,6 +666,7 @@ def _make_plot(self): if mask.any(): y = np.ma.array(y) y = np.ma.masked_where(mask, y) + plotf(ax, x, y, style, label=label, **self.kwds) ax.grid(self.grid) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 58ffd2c8c4e38..af399ee83e185 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -561,6 +561,7 @@ def test_from_weekly_resampling(self): @slow def test_irreg_dtypes(self): + import matplotlib.pyplot as plt #date idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) @@ -573,10 +574,16 @@ def test_irreg_dtypes(self): _check_plot_works(df.plot) #time - inc = Series(np.random.randint(1, 6, 9)).cumsum().values + plt.close('all') + inc = Series(np.random.randint(1, 15, 3)).cumsum().values idx = [time(1, 1, i) for i in inc] df = DataFrame(np.random.randn(len(idx), 3), idx) - 
_check_plot_works(df.plot) + ax = df.plot() + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + td = dict(zip(ticks, labels)) + for i in range(3): + self.assert_(td[i].get_text() == str(idx[i])) PNG_PATH = 'tmp.png' def _check_plot_works(f, freq=None, series=None, *args, **kwargs): From 76c63517c8d5437abfdfcce958213cce01897c86 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 26 Jun 2012 20:21:46 -0400 Subject: [PATCH 42/42] BUG/TST: typo caused read_csv to lose index name #1536 --- pandas/io/parsers.py | 2 +- pandas/io/tests/test_parsers.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3b2fedd65b45f..fe07c37a49f36 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -662,7 +662,7 @@ def _get_index_name(self, columns=None): index_name = None elif np.isscalar(self.index_col): if isinstance(self.index_col, basestring): - index_names = self.index_col + index_name = self.index_col for i, c in enumerate(list(columns)): if c == self.index_col: self.index_col = i diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 1f14df528af8f..84a3b165a0af8 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -334,6 +334,23 @@ def test_index_col_named(self): self.assertRaises(ValueError, read_csv, StringIO(no_header), index_col='ID') + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a' : [1, 5, 9], 'b' : [2, 6, 10], 'c' : [3, 7, 11], + 'd' : [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = read_csv(StringIO(data), names=names, index_col=['message']) + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + + rs = read_csv(StringIO(data), names=names, index_col='message') + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + def test_multiple_skts_example(self): data = 
"year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." pass