diff --git a/.travis.yml b/.travis.yml index e8a9193f9b7c9..b48f6d834b62d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,20 +2,29 @@ language: python python: - 2.6 - - 3.3 env: global: - - NOSE_ARGS="not slow" # need at least this so travis page will show env column + - secure: "O04RU5QRKEDL/SrIWEsVe8O+1TxZqZQSa28Sd+Fz48NW/XddhefYyxzqcUXh\nk/NjWMqknJRQhApLolBianVpsE577OTllzlcyKn3nUL6hjOXcoszGaYray7S\niNGKGyO8xrtB/ZQDtmupz0ksK8sLoCTscdiGotFulczbx0zt+4g=" + - secure: "PUJ9nC1/v2vpFUtELSoSjI53OHCVXfFTb8+t5lIGIqHtjUBkhiJSNPfCv8Bx\ndsdrx30qP8KsSceYzaa/bog6p8YNU1iih23S0KbjucutvA0LNHBTNvnxmjBR\nSJfKd5FmwnXvizRyghYBzmQ3NmGO7ADw2DBwKOhgGMqCHZ8Tlc8=" + - secure: "IDcMrCCW+6pgJtsI3Q163OPc0iec1ogpitaqiRhHcrEBUCXZgVeclOeiZBlw\n/u+uGyW/O0NhHMaFXKB8BdDVwlQEEHv48syN6npS/A5+O6jriWKL4ozttOhE\npOlu+yLhHnEwx6wZVIHRTVn+t1GkOrjlBcjaQi+Z13G3XmDaSG8=" + - secure: "Zu9aj0dTGpvMqT/HqBGQgDYl/v5ubC7lFwfE8Fqb0N1UVXqbpjXnNH/7oal1\nUsIT7klO++LWm+LxsP/A1FWENTSgdYe99JQtNyauW+0x5YR1JTuDJ8atDgx9\nSq66CaVpS5t+ov7UVm2bKSUX+1S8+8zGbIDADrMxEzYEMF7WoGM=" + - secure: "AfIvLxvCxj22zrqg3ejGf/VePKT2AyGT9erYzlKpBS0H8yi5Pp1MfmJjhaR4\n51zBtzqHPHiIEY6ZdE06o9PioMWkXS+BqJNrxGSbt1ltxgOFrxW5zOpwiFGZ\nZOv1YeFkuPf8PEsWT7615mdydqTQT7B0pqUKK/d6aka4TQ/tg5Q=" + - secure: "EM4ySBUusReNu7H1QHXvjnP/J1QowvfpwEBmjysYxJuq7KcG8HhhlfpUF+Gh\nLBzLak9QBA67k4edhum3qtKuJR5cHuja3+zuV8xmx096B/m96liJFTrwZpea\n58op3W6ZULctEpQNgIkyae20bjxl4f99JhZRUlonoPfx/rBIMFc=" + - secure: "pgMYS/6MQqDGb58qdzTJesvAMmcJWTUEEM8gf9rVbfqfxceOL4Xpx8siR9B2\nC4U4MW1cHMPP3RFEb4Jy0uK49aHH10snwZY1S84YPPllpH5ZFXVdN68OayNj\nh4k5N/2hhaaQuJ6Uh8v8s783ye4oYTOW5RJUFqQu4QdG4IkTIMs=" + + - NOSE_ARGS="not slow" UPLOAD=true matrix: include: - python: 2.7 - env: NOSE_ARGS="not network" LOCALE_OVERRIDE="zh_CN.GB18030" + env: NOSE_ARGS="slow and not network" LOCALE_OVERRIDE="zh_CN.GB18030" FULL_DEPS=true UPLOAD=false - python: 2.7 - env: NOSE_ARGS="not slow" FULL_DEPS=true + env: NOSE_ARGS="not slow" FULL_DEPS=true UPLOAD=true - python: 3.2 - env: NOSE_ARGS="not slow" FULL_DEPS=true + env: NOSE_ARGS="not slow" FULL_DEPS=true UPLOAD=true + - python: 3.3 + env: NOSE_ARGS="not slow" UPLOAD=true # allow importing from site-packages, # so apt-get python-x works for system pythons @@ -26,11 +35,14 @@ virtualenv: before_install: - echo "Waldo1" - echo $VIRTUAL_ENV + - df - date - - export PIP_ARGS=-q # comment this this to debug travis install issues - - export APT_ARGS=-qq # comment this to debug travis install issues + # - export PIP_ARGS=-q # comment this this to debug travis install issues + # - export APT_ARGS=-qq # comment this to debug travis install issues # - set -x # enable this to see bash commands - - source ci/before_install.sh # we need to source this to bring in the env + - export ZIP_FLAGS=-q # comment this to debug travis install issues + - source ci/envars.sh # we need to source this to bring in the envars + - ci/before_install.sh - python -V install: @@ -43,3 +55,4 @@ script: after_script: - ci/print_versions.py + - ci/after_script.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6af9924458add..8de89d2e05033 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ your contribution or address the issue you're having. - When submitting a Pull Request - **Make sure the test suite passes**., and that means on python3 as well. - You can use "test_fast.sh", or tox locally and/or enable Travis-CI on your fork. + You can use "test_fast.sh", or tox locally and/or [enable Travis-CI](http://about.travis-ci.org/docs/user/getting-started/) on your fork. 
- We suggest you enable Travis-CI on your fork, to make it easier for the team to see that the PR does indeed pass all the tests. - Back-compatiblitiy **really** matters. Pandas already has a large user-base and @@ -47,6 +47,16 @@ your contribution or address the issue you're having. and finally a commit message body if there's a need for one. - Please reference the GH issue number in your commit message using GH1234 or #1234, either style is fine. + - Use "raise AssertionError" rather than plain `assert` in library code (using assert is fine + for test code). python -O strips assertions. Better safe than sorry. + - When writing tests, don't use "new" assertion methods added to the unittest module + in 2.7 since pandas currently supports 2.6. The most common pitfall is: + + with self.assertRaises(ValueError): + foo + + which fails on python 2.6, use `self.assertRaises(TheException,func,args)` instead. + - RELEASE.rst and doc/source/vx.y.z.txt contain an on-going changelog for each release as it is worked on. Add entries to these files as needed in a separate commit in your PR, documenting the fix, enhancement or (unavoidable) @@ -63,6 +73,7 @@ your contribution or address the issue you're having. - If your code changes are intermixed with style fixes, they are harder to review before merging. Keep style fixes in separate commits. - it's fine to clean-up a little around an area you just worked on. + - Generally it's a BAD idea to PEP8 on documentation. Having said that, if you still feel a PEP8 storm is in order, go for it. diff --git a/RELEASE.rst b/RELEASE.rst index 610e9254289aa..12c3ebd2a924b 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -26,7 +26,7 @@ Where to get it pandas 0.11.0 ============= -**Release date:** 2013-??-?? +**Release date:** 2013-04-22 **New features** @@ -173,13 +173,23 @@ pandas 0.11.0 when invalid shapes are passed - Don't suppress TypeError in GroupBy.agg (GH3238_) - Methods return None when inplace=True (GH1893_) - + - HTML repr output for dataframes is once again controlled by the option + `display.notebook_repr_html`, and on by default. - ``HDFStore`` - added the method ``select_column`` to select a single column from a table as a Series. - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()`` + - ``min_itemsize`` parameter will now automatically create data_columns for passed keys - Downcast on pivot if possible (GH3283_), adds argument ``downcast`` to ``fillna`` + - Introduced options `display.height/width` for explicitly specifying terminal + height/width in characters. Deprecated display.line_width, now replaced by display.width. + These defaults are in effect for scripts as well, so unless disabled, previously + very wide output will now be output as "expand_repr" style wrapped output. + - Various defaults for options (including display.max_rows) have been revised, + after a brief survey concluded they were wrong for everyone. Now at w=80,h=60. + - HTML repr output for dataframes is once again controlled by the option + `display.notebook_repr_html`, and on by default. 
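A minimal sketch of adjusting the display options described above (assuming an interactive session with pandas 0.11; the option names and the w=80 / h=60 defaults are those listed above)::

    import pandas as pd

    pd.set_option('display.width', 80)     # terminal width in characters (None = auto-detect)
    pd.set_option('display.height', 60)    # terminal height in lines
    pd.set_option('display.max_rows', 60)  # rows shown before falling back to a summary repr
    pd.set_option('display.notebook_repr_html', True)  # HTML repr in the notebook, on by default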
**Bug Fixes** @@ -296,12 +306,21 @@ pandas 0.11.0 - Ensure index casts work even in Int64Index - Fix set_index segfault when passing MultiIndex (GH3308_) - Ensure pickles created in py2 can be read in py3 + - Insert ellipsis in MultiIndex summary repr (GH3348_) + - Groupby will handle mutation among an input groups columns (and fallback + to non-fast apply) (GH3380_) + - Eliminated unicode errors on FreeBSD when using MPL GTK backend (GH3360_) + - Period.strftime should return unicode strings always (GH3363_) + - Respect passed read_* chunksize in get_chunk function (GH3406_) .. _GH3294: https://github.com/pydata/pandas/issues/3294 .. _GH622: https://github.com/pydata/pandas/issues/622 +.. _GH3348: https://github.com/pydata/pandas/issues/3348 .. _GH797: https://github.com/pydata/pandas/issues/797 .. _GH1893: https://github.com/pydata/pandas/issues/1893 .. _GH1978: https://github.com/pydata/pandas/issues/1978 +.. _GH3360: https://github.com/pydata/pandas/issues/3360 +.. _GH3363: https://github.com/pydata/pandas/issues/3363 .. _GH2758: https://github.com/pydata/pandas/issues/2758 .. _GH3275: https://github.com/pydata/pandas/issues/3275 .. _GH2121: https://github.com/pydata/pandas/issues/2121 @@ -406,6 +425,8 @@ pandas 0.11.0 .. _GH2919: https://github.com/pydata/pandas/issues/2919 .. _GH3308: https://github.com/pydata/pandas/issues/3308 .. _GH3311: https://github.com/pydata/pandas/issues/3311 +.. _GH3380: https://github.com/pydata/pandas/issues/3380 +.. _GH3406: https://github.com/pydata/pandas/issues/3406 pandas 0.10.1 ============= diff --git a/ci/before_install.sh b/ci/before_install.sh index 9561c713d0f2e..677ddfa642f80 100755 --- a/ci/before_install.sh +++ b/ci/before_install.sh @@ -1,28 +1,36 @@ #!/bin/bash +# If envars.sh determined we're running in an authorized fork +# and the user opted in to the network cache,and that cached versions +# are available on the cache server, download and deploy the cached +# files to the local filesystem + echo "inside $0" # overview -if [ ${TRAVIS_PYTHON_VERSION} == "3.3" ]; then - sudo add-apt-repository -y ppa:doko/ppa # we get the py3.3 debs from here -fi - sudo apt-get update $APT_ARGS # run apt-get update for all versions -# # hack for broken 3.3 env -# if [ x"$VIRTUAL_ENV" == x"" ]; then -# VIRTUAL_ENV=~/virtualenv/python$TRAVIS_PYTHON_VERSION_with_system_site_packages; -# fi +if $PLEASE_TRAVIS_FASTER ; then + echo "Faster? well... I'll try." 
+ + if $CACHE_FILE_AVAILABLE ; then + echo retrieving "$CACHE_FILE_URL"; + + wget -q "$CACHE_FILE_URL" -O "/tmp/_$CYTHON_HASH.zip"; + unzip $ZIP_FLAGS /tmp/_"$CYTHON_HASH.zip" -d "$BUILD_CACHE_DIR"; + rm -f /tmp/_"$CYTHON_HASH.zip" + # copy cythonized c files over + cp -R "$BUILD_CACHE_DIR"/pandas/*.c pandas/ + cp -R "$BUILD_CACHE_DIR"/pandas/src/*.c pandas/src/ + fi; + echo "VENV_FILE_AVAILABLE=$VENV_FILE_AVAILABLE" + if $VENV_FILE_AVAILABLE ; then + echo "getting venv" + wget -q $VENV_FILE_URL -O "/tmp/venv.zip"; + sudo unzip $ZIP_FLAGS -o /tmp/venv.zip -d "/"; + sudo chown travis -R "$VIRTUAL_ENV" + rm -f /tmp/_"$CYTHON_HASH.zip" + fi; +fi -# # we only recreate the virtualenv for 3.x -# # since the "Detach bug" only affects python3 -# # and travis has numpy preinstalled on 2.x which is quicker -# _VENV=$VIRTUAL_ENV # save it -# if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ] ; then -# deactivate # pop out of any venv -# sudo pip install virtualenv==1.8.4 --upgrade -# sudo apt-get install $APT_ARGS python3.3 python3.3-dev -# sudo rm -Rf $_VENV -# virtualenv -p python$TRAVIS_PYTHON_VERSION $_VENV --system-site-packages; -# source $_VENV/bin/activate -# fi +true # never fail because bad things happened here diff --git a/ci/envars.sh b/ci/envars.sh new file mode 100755 index 0000000000000..2b4cacfd96fe4 --- /dev/null +++ b/ci/envars.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# This must be sourced by .travis.yml, so any envars exported here will +# be available to the rest of the build stages + +# - computes a hash based on the cython files in the codebade +# - retrieves the decrypted key if any for all whitelisted forks +# - checks whether the user optd int to use the cache +# - if so, check for availablity of cache files on the server, based on hash +# - set envars to control what the following scripts do + +# at most one of these will decrypt, so the end result is that $STORE_KEY +# either holds a single key or does not +export STORE_KEY="$STORE_KEY0""$STORE_KEY1""$STORE_KEY2""$STORE_KEY3""$STORE_KEY4" +export STORE_KEY="$STORE_KEY""$STORE_KEY5""$STORE_KEY6""$STORE_KEY7" + +export CYTHON_HASH=$(find pandas | grep -P '\.(pyx|pxd)$' | sort \ + | while read N; do echo $(tail -n+1 $N | md5sum ) ;done | md5sum| cut -d ' ' -f 1) + +export CYTHON_HASH=$CYTHON_HASH-$TRAVIS_PYTHON_VERSION + +# where the cache files live on the server +export CACHE_FILE_URL="https://cache27-pypandas.rhcloud.com/static/$STORE_KEY/$CYTHON_HASH.zip" +export VENV_FILE_URL="https://cache27-pypandas.rhcloud.com/static/$STORE_KEY/venv-$TRAVIS_PYTHON_VERSION.zip" +export CACHE_FILE_STORE_URL="https://cache27-pypandas.rhcloud.com/store/$STORE_KEY" + +echo "Hashing:" +find pandas | grep -P '\.(pyx|pxd)$' +echo "Key: $CYTHON_HASH" + +export CACHE_FILE_AVAILABLE=false +export VENV_FILE_AVAILABLE=false +export PLEASE_TRAVIS_FASTER=false + +# check whether the user opted in to use the cache via commit message +if [ x"$(git log --format='%B' -n 1 | grep PLEASE_TRAVIS_FASTER | wc -l)" != x"0" ]; then + export PLEASE_TRAVIS_FASTER=true +fi; +if [ x"$(git log --format='%B' -n 1 | grep PTF | wc -l)" != x"0" ]; then + export PLEASE_TRAVIS_FASTER=true +fi; + +if $PLEASE_TRAVIS_FASTER; then + + # check whether the files exists on the server + curl -s -f -I "$CACHE_FILE_URL" # silent, don;t expose key + if [ x"$?" == x"0" ] ; then + export CACHE_FILE_AVAILABLE=true; + fi + + + curl -s -f -I "$VENV_FILE_URL" # silent, don;t expose key + if [ x"$?" 
== x"0" ] ; then + export VENV_FILE_AVAILABLE=true; + fi + + # the pandas build cache machinery needs this set, and the directory created + export BUILD_CACHE_DIR="/tmp/build_cache" + mkdir "$BUILD_CACHE_DIR" +fi; + +# debug +echo "PLEASE_TRAVIS_FASTER=$PLEASE_TRAVIS_FASTER" +echo "CACHE_FILE_AVAILABLE=$CACHE_FILE_AVAILABLE" +echo "VENV_FILE_AVAILABLE=$VENV_FILE_AVAILABLE" + +true diff --git a/ci/install.sh b/ci/install.sh index 7fe425e055589..8d9ab3aac3374 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -1,75 +1,130 @@ #!/bin/bash +# There are 2 distinct pieces that get zipped and cached +# - The venv site-packages dir including the installed dependencies +# - The pandas build artifacts, using the build cache support via +# scripts/use_build_cache.py +# +# if the user opted in to use the cache and we're on a whitelisted fork +# - if the server doesn't hold a cached version of venv/pandas build, +# do things the slow way, and put the results on the cache server +# for the next time. +# - if the cache files are available, instal some necessaries via apt +# (no compiling needed), then directly goto script and collect 200$. +# + echo "inside $0" -# Install Dependencies +# Install Dependencie +SITE_PKG_DIR=$VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/site-packages +echo "Using SITE_PKG_DIR: $SITE_PKG_DIR" # workaround for travis ignoring system_site_packages in travis.yml rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt -# Hard Deps -pip install $PIP_ARGS --use-mirrors cython nose python-dateutil pytz - -# try and get numpy as a binary deb +if [ x"$LOCALE_OVERRIDE" != x"" ]; then + # make sure the locale is available + # probably useless, since you would need to relogin + sudo locale-gen "$LOCALE_OVERRIDE" +fi; -# numpy is preinstalled on 2.x -# if [ ${TRAVIS_PYTHON_VERSION} == "2.7" ]; then -# sudo apt-get $APT_ARGS install python-numpy; -# fi +#scipy is not included in the cached venv +if [ x"$FULL_DEPS" == x"true" ] ; then + # for pytables gets the lib as well + sudo apt-get $APT_ARGS install libhdf5-serial-dev; -if [ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then - sudo apt-get $APT_ARGS install python3-numpy; -elif [ ${TRAVIS_PYTHON_VERSION} == "3.3" ]; then # should be >=3,3 - pip $PIP_ARGS install numpy==1.7.0; -else - pip $PIP_ARGS install numpy==1.6.1; + if [ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then + sudo apt-get $APT_ARGS install python3-scipy + elif [ ${TRAVIS_PYTHON_VERSION} == "2.7" ]; then + sudo apt-get $APT_ARGS install python-scipy + fi fi -# Optional Deps -if [ x"$FULL_DEPS" == x"true" ]; then - echo "Installing FULL_DEPS" - if [ ${TRAVIS_PYTHON_VERSION} == "2.7" ]; then - sudo apt-get $APT_ARGS install python-scipy; - fi +# Everything installed inside this clause into site-packages +# will get included in the cached venv downloaded from the net +# in PTF mode +if ( ! 
$VENV_FILE_AVAILABLE ); then + echo "Running full monty" + # Hard Deps + pip install $PIP_ARGS nose python-dateutil pytz + pip install $PIP_ARGS cython - if [ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then - sudo apt-get $APT_ARGS install python3-scipy; + if [ ${TRAVIS_PYTHON_VERSION} == "3.3" ]; then # should be >=3,3 + pip install $PIP_ARGS numpy==1.7.0 + elif [ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then + # sudo apt-get $APT_ARGS install python3-numpy; # 1.6.2 or precise + pip install $PIP_ARGS numpy==1.6.1 + else + pip install $PIP_ARGS numpy==1.6.1 fi - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then - sudo apt-get $APT_ARGS install libhdf5-serial-dev; - pip install numexpr - pip install tables + # Optional Deps + if [ x"$FULL_DEPS" == x"true" ]; then + echo "Installing FULL_DEPS" + pip install $PIP_ARGS cython + + if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then + # installed explicitly above, to get the library as well + # sudo apt-get $APT_ARGS install libhdf5-serial-dev; + pip install numexpr + pip install tables + pip install $PIP_ARGS xlwt + fi + + pip install $PIP_ARGS matplotlib + pip install $PIP_ARGS openpyxl + pip install $PIP_ARGS xlrd>=0.9.0 + pip install $PIP_ARGS 'http://downloads.sourceforge.net/project/pytseries/scikits.timeseries/0.91.3/scikits.timeseries-0.91.3.tar.gz?r=' + pip install $PIP_ARGS patsy + + # fool statsmodels into thinking pandas was already installed + # so it won't refuse to install itself. We want it in the zipped venv + + mkdir $SITE_PKG_DIR/pandas + touch $SITE_PKG_DIR/pandas/__init__.py + echo "version='0.10.0-phony'" > $SITE_PKG_DIR/pandas/version.py + pip install $PIP_ARGS git+git://github.com/statsmodels/statsmodels@c9062e43b8a5f7385537ca95#egg=statsmodels + + rm -Rf $SITE_PKG_DIR/pandas # scrub phoney pandas fi - pip install $PIP_ARGS --use-mirrors openpyxl matplotlib; - pip install $PIP_ARGS --use-mirrors xlrd xlwt; - pip install $PIP_ARGS 'http://downloads.sourceforge.net/project/pytseries/scikits.timeseries/0.91.3/scikits.timeseries-0.91.3.tar.gz?r=' -fi + # pack up the venv and cache it + if [ x"$STORE_KEY" != x"" ] && $UPLOAD && $PLEASE_TRAVIS_FASTER ; then + VENV_FNAME="venv-$TRAVIS_PYTHON_VERSION.zip" -if [ x"$VBENCH" == x"true" ]; then - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then - sudo apt-get $APT_ARGS install libhdf5-serial-dev; - pip install numexpr - pip install tables + zip $ZIP_FLAGS -r "$HOME/$VENV_FNAME" $SITE_PKG_DIR/ + ls -l "$HOME/$VENV_FNAME" + echo "posting venv" + # silent, don't expose key + curl -s --form upload=@"$HOME/$VENV_FNAME" "$CACHE_FILE_STORE_URL/$VENV_FNAME" fi - pip $PIP_ARGS install sqlalchemy git+git://github.com/pydata/vbench.git; -fi -#build and install pandas -python setup.py build_ext install - -#HACK: pandas is a statsmodels dependency -# so we need to install it after pandas -if [ x"$FULL_DEPS" == x"true" ]; then - pip install patsy - # pick recent 0.5dev dec/2012 - pip install git+git://github.com/statsmodels/statsmodels@c9062e43b8a5f7385537ca95#egg=statsmodels fi; -# make sure the desired locale is generated -if [ x"$LOCALE_OVERRIDE" != x"" ]; then - # piggyback this build for plotting tests. oh boy. 
- pip install $PIP_ARGS --use-mirrors matplotlib; +#build and install pandas +if [ x"$BUILD_CACHE_DIR" != x"" ]; then + scripts/use_build_cache.py -d + python setup.py install; +else + python setup.py build_ext install +fi - sudo locale-gen "$LOCALE_OVERRIDE" +# package pandas build artifacts and send them home +# that's everything the build cache (scripts/use_build_cache.py) +# stored during the build (.so, pyx->.c and 2to3) +if (! $CACHE_FILE_AVAILABLE) ; then + if [ x"$STORE_KEY" != x"" ] && $UPLOAD && $PLEASE_TRAVIS_FASTER ; then + echo "Posting artifacts" + strip "$BUILD_CACHE_DIR/*" &> /dev/null + echo "$BUILD_CACHE_DIR" + cd "$BUILD_CACHE_DIR"/ + zip -r $ZIP_FLAGS "$HOME/$CYTHON_HASH".zip * + cd "$TRAVIS_BUILD_DIR" + pwd + zip "$HOME/$CYTHON_HASH".zip $(find pandas | grep -P '\.(pyx|pxd)$' | sed -r 's/.(pyx|pxd)$/.c/') + + # silent, don't expose key + curl --connect-timeout 5 -s --form upload=@"$HOME/$CYTHON_HASH".zip "$CACHE_FILE_STORE_URL/$CYTHON_HASH.zip" + fi fi + +true diff --git a/ci/print_versions.py b/ci/print_versions.py index bbd7980bc0d69..99aafce48bff4 100755 --- a/ci/print_versions.py +++ b/ci/print_versions.py @@ -4,7 +4,16 @@ print("\nINSTALLED VERSIONS") print("------------------") print("Python: %d.%d.%d.%s.%s" % sys.version_info[:]) +try: + import os + (sysname, nodename, release, version, machine) = os.uname() + print("OS: %s %s %s %s" % (sysname, release, version,machine)) + print("LC_ALL: %s" % os.environ.get('LC_ALL',"None")) + print("LANG: %s" % os.environ.get('LANG',"None")) +except: + pass +print("") try: import Cython print("Cython: %s" % Cython.__version__) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 9a3dc5f37934a..2eda474d7954f 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -121,8 +121,14 @@ Sorting by values Selection --------- -See the :ref:`Indexing section ` +.. note:: + While standard Python / Numpy expressions for selecting and setting are + intuitive and come in handy for interactive work, for production code, we + recommend the optimized pandas data access methods, ``.at``, ``.iat``, + ``.loc``, ``.iloc`` and ``.ix``. + +See the :ref:`Indexing section ` and below. Getting ~~~~~~~ @@ -230,7 +236,8 @@ For getting fast access to a scalar (equiv to the prior method) df.iat[1,1] There is one signficant departure from standard python/numpy slicing semantics. -python/numpy allow slicing past the end of an array without an associated error. +python/numpy allow slicing past the end of an array without an associated +error. .. ipython:: python @@ -239,7 +246,8 @@ python/numpy allow slicing past the end of an array without an associated error. x[4:10] x[8:10] -Pandas will detect this and raise ``IndexError``, rather than return an empty structure. +Pandas will detect this and raise ``IndexError``, rather than return an empty +structure. :: @@ -306,11 +314,13 @@ A ``where`` operation with setting. df2[df2 > 0] = -df2 df2 + Missing Data ------------ -Pandas primarily uses the value ``np.nan`` to represent missing data. It -is by default not included in computations. See the :ref:`Missing Data section ` +Pandas primarily uses the value ``np.nan`` to represent missing data. It is by +default not included in computations. See the :ref:`Missing Data section +` Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data. @@ -457,8 +467,8 @@ Append rows to a dataframe. 
See the :ref:`Appending ` Grouping -------- -By "group by" we are referring to a process involving one or more of the following -steps +By "group by" we are referring to a process involving one or more of the +following steps - **Splitting** the data into groups based on some criteria - **Applying** a function to each group independently @@ -481,7 +491,8 @@ Grouping and then applying a function ``sum`` to the resulting groups. df.groupby('A').sum() -Grouping by multiple columns forms a hierarchical index, which we then apply the function. +Grouping by multiple columns forms a hierarchical index, which we then apply +the function. .. ipython:: python @@ -547,10 +558,10 @@ We can produce pivot tables from this data very easily: Time Series ----------- -Pandas has simple, powerful, and efficient functionality for -performing resampling operations during frequency conversion (e.g., converting -secondly data into 5-minutely data). This is extremely common in, but not -limited to, financial applications. See the :ref:`Time Series section ` +Pandas has simple, powerful, and efficient functionality for performing +resampling operations during frequency conversion (e.g., converting secondly +data into 5-minutely data). This is extremely common in, but not limited to, +financial applications. See the :ref:`Time Series section ` .. ipython:: python diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index db311e9be9ecb..338963abd24e3 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -151,6 +151,10 @@ Timeseries `Vectorized Lookup `__ +Turn a matrix with hours in columns and days in rows into a continous row sequence in the form of a time series. +`How to rearrange a python pandas dataframe? +`__ + .. _cookbook.resample: Resampling @@ -233,10 +237,6 @@ The :ref:`CSV ` docs `Dealing with bad lines `__ -Turn a matrix with hours in columns and days in rows into a continous row sequence in the form of a time series. -`How to rearrange a python pandas dataframe? -`__ - .. _cookbook.sql: SQL @@ -273,6 +273,11 @@ The :ref:`HDFStores ` docs `Merging on-disk tables with millions of rows `__ +Deduplicating a large store by chunks, essentially a recusive reduction operation. Shows a function for taking in data from +csv file and creating a store by chunks, with date parsing as well. +`See here +`__ + `Large Data work flows `__ @@ -282,6 +287,9 @@ The :ref:`HDFStores ` docs `Troubleshoot HDFStore exceptions `__ +`Setting min_itemsize with strings +`__ + Storing Attributes to a group node .. ipython:: python diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index be15260c8151c..f0ca08d22d7dc 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -173,7 +173,7 @@ the length of the ``groups`` dict, so it is largely just a convenience: len(grouped) By default the group keys are sorted during the groupby operation. You may -however pass ``sort``=``False`` for potential speedups: +however pass ``sort=False`` for potential speedups: .. ipython:: python diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 853de3ee37ca2..eee14cffb46be 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -32,6 +32,19 @@ attention in this area. Expect more work to be invested higher-dimensional data structures (including Panel) in the future, especially in label-based advanced indexing. +.. 
note:: + + The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` provide quick and easy access to pandas data structures + across a wide range of use cases. This makes interactive work intuitive, as + there's little new to learn if you already know how to deal with Python + dictionaries and NumPy arrays. However, since the type of the data to be accessed + isn't known in advance, directly using + standard operators has some optimization limits. For production code, we recommended + that you take advantage of the optimized pandas data access methods exposed in this chapter. + + In addition, whether a copy or a reference is returned for a selection operation, may depend on the context. + See :ref:`Returning a View versus Copy ` + See the :ref:`cookbook` for some advanced strategies Choice @@ -41,46 +54,38 @@ Starting in 0.11.0, object selection has had a number of user-requested addition order to support more explicit location based indexing. Pandas now supports three types of multi-axis indexing. - - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, - allowed inputs are: - - - A single label, e.g. ``5`` or ``'a'`` - - (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - - A list or array of labels ``['a', 'b', 'c']`` - - A slice object with labels ``'a':'f'`` +- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are: - (note that contrary to usual python slices, **both** the start and the stop are included!) - - A boolean array - - See more at :ref:`Selection by Label ` + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array - - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will - raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: + See more at :ref:`Selection by Label ` - - An integer e.g. ``5`` - - A list or array of integers ``[4, 3, 0]`` - - A slice object with ints ``1:7`` - - A boolean array +- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: - See more at :ref:`Selection by Position ` + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array - - ``.ix`` supports mixed integer and label based access. It is primarily label based, but - will fallback to integer positional access. ``.ix`` is the most general and will support - any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. + See more at :ref:`Selection by Position ` - As using integer slices with ``.ix`` have different behavior depending on whether the slice - is interpreted as integer location based or label position based, it's usually better to be - explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). +- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. 
``.ix`` is the most general + and will support any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label + based hierarchial indexes. - ``.ix`` is especially useful when dealing with mixed positional and label based hierarchial indexes. + As using integer slices with ``.ix`` have different behavior depending on whether the slice is interpreted as position based or label based, it's + usually better to be explicit and use ``.iloc`` or ``.loc``. - See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical ` + See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and :ref:`Fallback Indexing ` -Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an -example, but applies to ``.iloc`` and ``.ix`` as well) Any of the axes accessors may be the null -slice ``:``. Axes left out of the specification are assumed to be ``:``. -(e.g. ``p.loc['a']`` is equiv to ``p.loc['a',:,:]``) +Getting values from an object with multi-axes selection uses the following +notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as +well). Any of the axes accessors may be the null slice ``:``. Axes left out of +the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to +``p.loc['a',:,:]``) .. csv-table:: :header: "Object Type", "Indexers" @@ -94,7 +99,7 @@ slice ``:``. Axes left out of the specification are assumed to be ``:``. Deprecations ~~~~~~~~~~~~ -Starting in version 0.11.0, these methods may be deprecated in future versions. +Starting in version 0.11.0, these methods *may* be deprecated in future versions. - ``irow`` - ``icol`` @@ -102,16 +107,6 @@ Starting in version 0.11.0, these methods may be deprecated in future versions. See the section :ref:`Selection by Position ` for substitutes. -.. _indexing.xs: - -Cross-sectional slices on non-hierarchical indices are now easily performed using -``.loc`` and/or ``.iloc``. These methods now exist primarily for backward compatibility. - - - ``xs`` (for DataFrame), - - ``minor_xs`` and ``major_xs`` (for Panel) - -See the section at :ref:`Selection by Label ` for substitutes. - .. _indexing.basics: Basics @@ -155,6 +150,19 @@ Thus, as per above, we have the most basic indexing using ``[]``: s[dates[5]] panel['two'] +You can pass a list of columns to ``[]`` to select columns in that order. +If a column is not contained in the DataFrame, an exception will be +raised. Multiple columns can also be set in this manner: + +.. ipython:: python + + df + df[['B', 'A']] = df[['A', 'B']] + df + +You may find this useful for applying a transform (in-place) to a subset of the +columns. + Attribute Access ~~~~~~~~~~~~~~~~ @@ -162,7 +170,8 @@ Attribute Access .. _indexing.df_cols: -You may access a column on a ``DataFrame``, and a item on a ``Panel`` directly as an attribute: +You may access a column on a ``DataFrame``, and a item on a ``Panel`` directly +as an attribute: .. ipython:: python @@ -172,26 +181,12 @@ You may access a column on a ``DataFrame``, and a item on a ``Panel`` directly a If you are using the IPython environment, you may also use tab-completion to see these accessable attributes. -You can pass a list of columns to ``[]`` to select columns in that order: -If a column is not contained in the DataFrame, an exception will be -raised. Multiple columns can also be set in this manner: - -.. 
ipython:: python - - df - df[['B', 'A']] = df[['A', 'B']] - df - -You may find this useful for applying a transform (in-place) to a subset of the -columns. - Slicing ranges ~~~~~~~~~~~~~~ The most robust and consistent way of slicing ranges along arbitrary axes is -described in the :ref:`Selection by Position ` section detailing -the ``.iloc`` method. For now, we explain the semantics of slicing using the -``[]`` operator. +described in the :ref:`Selection by Position ` section +detailing the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. With Series, the syntax works exactly as with an ndarray, returning a slice of the values and the corresponding labels: @@ -223,23 +218,15 @@ largely as a convenience since it is such a common operation. Selection By Label ~~~~~~~~~~~~~~~~~~ -Pandas provides a suite of methods in order to have **purely label based indexing**. -This is a strict inclusion based protocol. **ALL** of the labels for which you ask, -must be in the index or a ``KeyError`` will be raised! - -When slicing, the start bound is *included*, **AND** the stop bound is *included*. -Integers are valid labels, but they refer to the label *and not the position*. +Pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. +**ALL** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. The ``.loc`` attribute is the primary access method. The following are valid inputs: - - A single label, e.g. ``5`` or ``'a'`` - - (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - - A list or array of labels ``['a', 'b', 'c']`` - - A slice object with labels ``'a':'f'`` - - (note that contrary to usual python slices, **both** the start and the stop are included!) - - A boolean array +- A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) +- A list or array of labels ``['a', 'b', 'c']`` +- A slice object with labels ``'a':'f'`` (note that contrary to usual python slices, **both** the start and the stop are included!) +- A boolean array .. ipython:: python @@ -271,7 +258,7 @@ Accessing via label slices df1.loc['d':,'A':'C'] -For getting a cross section using a label (equiv to deprecated ``df.xs('a')``) +For getting a cross section using a label (equiv to ``df.xs('a')``) .. ipython:: python @@ -296,18 +283,14 @@ For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``) Selection By Position ~~~~~~~~~~~~~~~~~~~~~ -Pandas provides a suite of methods in order to get **purely integer based indexing**. -The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. - -When slicing, the start bounds is *included*, while the upper bound is *excluded*. -Trying to use a non-integer, even a **valid** label will raise a ``IndexError``. +Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. When slicing, the start bounds is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise a ``IndexError``. 
The ``.iloc`` attribute is the primary access method. The following are valid inputs: - - An integer e.g. ``5`` - - A list or array of integers ``[4, 3, 0]`` - - A slice object with ints ``1:7`` - - A boolean array +- An integer e.g. ``5`` +- A list or array of integers ``[4, 3, 0]`` +- A slice object with ints ``1:7`` +- A boolean array .. ipython:: python @@ -370,7 +353,7 @@ For getting a scalar via integer position (equiv to deprecated ``df.get_value(1, # this is also equivalent to ``df1.iat[1,1]`` df1.iloc[1,1] -For getting a cross section using an integer position (equiv to deprecated ``df.xs(1)``) +For getting a cross section using an integer position (equiv to ``df.xs(1)``) .. ipython:: python @@ -401,11 +384,10 @@ Fast scalar value getting and setting Since indexing with ``[]`` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure out what you're asking for. If you only want to access a scalar value, the -fastest way is to use the ``at`` and ``iat`` methods, which are implemented on all of -the data structures. +fastest way is to use the ``at`` and ``iat`` methods, which are implemented on +all of the data structures. -Similary to ``loc``, ``at`` provides **label** based scalar lookups, while, ``iat`` provides -**integer** based lookups analagously to ``iloc`` +Similary to ``loc``, ``at`` provides **label** based scalar lookups, while, ``iat`` provides **integer** based lookups analagously to ``iloc`` .. ipython:: python @@ -413,9 +395,9 @@ Similary to ``loc``, ``at`` provides **label** based scalar lookups, while, ``ia df.at[dates[5], 'A'] df.iat[3, 0] -You can also set using these same indexers. These have the additional capability -of enlarging an object. This method *always* returns a reference to the object -it modified, which in the case of enlargement, will be a **new object**: +You can also set using these same indexers. These have the additional +capability of enlarging an object. This method *always* returns a reference to +the object it modified, which in the case of enlargement, will be a **new object**: .. ipython:: python @@ -428,8 +410,7 @@ Boolean indexing .. _indexing.boolean: Another common operation is the use of boolean vectors to filter the data. -The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``. -These are grouped using parentheses. +The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``. These **must** be grouped by using parentheses. Using a boolean vector to index a Series works exactly as in a numpy ndarray: @@ -475,22 +456,19 @@ more complex criteria: # Multiple criteria df2[criterion & (df2['b'] == 'x')] -Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, -and :ref:`Advanced Indexing ` you may select along more than one axis using boolean vectors combined with other -indexing expressions. +Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, +and :ref:`Advanced Indexing ` you may select along more than one axis using boolean vectors combined with other indexing expressions. .. ipython:: python df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] - Where and Masking ~~~~~~~~~~~~~~~~~ -Selecting values from a Series with a boolean vector generally returns a subset of the data. -To guarantee that selection output has the same shape as the original data, you can use the -``where`` method in ``Series`` and ``DataFrame``. 
- +Selecting values from a Series with a boolean vector generally returns a +subset of the data. To guarantee that selection output has the same shape as +the original data, you can use the ``where`` method in ``Series`` and ``DataFrame``. To return only the selected rows @@ -504,15 +482,16 @@ To return a Series of the same shape as the original s.where(s > 0) -Selecting values from a DataFrame with a boolean critierion now also preserves input data shape. -``where`` is used under the hood as the implementation. Equivalent is ``df.where(df < 0)`` +Selecting values from a DataFrame with a boolean critierion now also preserves +input data shape. ``where`` is used under the hood as the implementation. +Equivalent is ``df.where(df < 0)`` .. ipython:: python df[df < 0] -In addition, ``where`` takes an optional ``other`` argument for replacement of values where the -condition is False, in the returned copy. +In addition, ``where`` takes an optional ``other`` argument for replacement of +values where the condition is False, in the returned copy. .. ipython:: python @@ -531,8 +510,9 @@ This can be done intuitively like so: df2[df2 < 0] = 0 df2 -Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), such that partial selection -with setting is possible. This is analagous to partial setting via ``.ix`` (but on the contents rather than the axis labels) +Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), +such that partial selection with setting is possible. This is analagous to +partial setting via ``.ix`` (but on the contents rather than the axis labels) .. ipython:: python @@ -540,8 +520,9 @@ with setting is possible. This is analagous to partial setting via ``.ix`` (but df2[ df2[1:4] > 0 ] = 3 df2 -By default, ``where`` returns a modified copy of the data. There is an optional parameter ``inplace`` -so that the original data can be modified without creating a copy: +By default, ``where`` returns a modified copy of the data. There is an +optional parameter ``inplace`` so that the original data can be modified +without creating a copy: .. ipython:: python @@ -567,7 +548,7 @@ Take Methods Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides the ``take`` method that retrieves elements along a given axis at the given indices. The given indices must be either a list or an ndarray of integer -index positions. +index positions. ``take`` will also accept negative integers as relative positions to the end of the object. .. ipython:: python @@ -634,10 +615,8 @@ If you want to identify and remove duplicate rows in a DataFrame, there are two methods that will help: ``duplicated`` and ``drop_duplicates``. Each takes as an argument the columns to use to identify duplicated rows. -``duplicated`` returns a boolean vector whose length is the number of rows, and -which indicates whether a row is duplicated. - -``drop_duplicates`` removes duplicate rows. +- ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. +- ``drop_duplicates`` removes duplicate rows. By default, the first observed row of a duplicate set is considered unique, but each method has a ``take_last`` parameter that indicates the last observed row @@ -674,21 +653,22 @@ Advanced Indexing with ``.ix`` .. note:: The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite - explicit about indexing choices. 
``.ix`` allows a great flexibility to specify - indexing locations by *label* and/or *integer position*. Pandas will attempt - to use any passed *integer* as *label* locations first (like what ``.loc`` - would do, then to fall back on *positional* indexing, like what ``.iloc`` - would do). See :ref:`Fallback Indexing ` for an example. + explicit about indexing choices. ``.ix`` allows a great flexibility to + specify indexing locations by *label* and/or *integer position*. Pandas will + attempt to use any passed *integer* as *label* locations first (like what + ``.loc`` would do, then to fall back on *positional* indexing, like what + ``.iloc`` would do). See :ref:`Fallback Indexing ` for + an example. -The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by Label `, -and ``.iloc`` in :ref:`Selection by Position `. +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by +Label `, and ``.iloc`` in :ref:`Selection by Position `. The ``.ix`` attribute takes the following inputs: - - An integer or single label, e.g. ``5`` or ``'a'`` - - A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` - - A slice object with ints ``1:7`` or labels ``'a':'f'`` - - A boolean array +- An integer or single label, e.g. ``5`` or ``'a'`` +- A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` +- A slice object with ints ``1:7`` or labels ``'a':'f'`` +- A boolean array We'll illustrate all of these methods. First, note that this provides a concise way of reindexing on multiple axes at once: @@ -752,15 +732,6 @@ labels or even boolean vectors: Slicing with labels is closely related to the ``truncate`` method which does precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). -Returning a view versus a copy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The rules about when a view on the data is returned are entirely dependent on -NumPy. Whenever an array of labels or a boolean vector are involved in the -indexing operation, the result will be a copy. With single label / scalar -indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be -returned. - The ``select`` method ~~~~~~~~~~~~~~~~~~~~~ @@ -785,14 +756,13 @@ numpy array. For instance, dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) dflookup.lookup(xrange(0,10,2), ['B','C','A','B','D']) - Setting values in mixed-type DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. _indexing.mixed_type_setting: -Setting values on a mixed-type DataFrame or Panel is supported when using scalar -values, though setting arbitrary vectors is not yet supported: +Setting values on a mixed-type DataFrame or Panel is supported when using +scalar values, though setting arbitrary vectors is not yet supported: .. ipython:: python @@ -803,6 +773,30 @@ values, though setting arbitrary vectors is not yet supported: print df2 print df2.dtypes +.. _indexing.view_versus_copy: + +Returning a view versus a copy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The rules about when a view on the data is returned are entirely dependent on +NumPy. Whenever an array of labels or a boolean vector are involved in the +indexing operation, the result will be a copy. With single label / scalar +indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be +returned. + +In chained expressions, the order may determine whether a copy is returned or not: + +.. 
ipython:: python + + + dfb = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : randn(7)}) + dfb[dfb.a.str.startswith('o')]['c'] = 42 # goes to copy (will be lost) + dfb['c'][dfb.a.str.startswith('o')] = 42 # passed via reference (will stay) + +When assigning values to subsets of your data, thus, make sure to either use the +pandas access methods or explicitly handle the assignment creating a copy. Fallback indexing ~~~~~~~~~~~~~~~~~~~~ @@ -926,10 +920,10 @@ See the :ref:`cookbook` for some advanced strategies Given that hierarchical indexing is so new to the library, it is definitely "bleeding-edge" functionality but is certainly suitable for production. But, - there may inevitably be some minor API changes as more use cases are explored - and any weaknesses in the design / implementation are identified. pandas aims - to be "eminently usable" so any feedback about new functionality like this is - extremely helpful. + there may inevitably be some minor API changes as more use cases are + explored and any weaknesses in the design / implementation are identified. + pandas aims to be "eminently usable" so any feedback about new + functionality like this is extremely helpful. Creating a MultiIndex (hierarchical index) object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -956,8 +950,10 @@ DataFrame to construct a MultiIndex automatically: .. ipython:: python - arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), - np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']) + , + np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']) + ] s = Series(randn(8), index=arrays) s df = DataFrame(randn(8, 4), index=arrays) @@ -983,8 +979,8 @@ of the index is up to you: We've "sparsified" the higher levels of the indexes to make the console output a bit easier on the eyes. -It's worth keeping in mind that there's nothing preventing you from using tuples -as atomic labels on an axis: +It's worth keeping in mind that there's nothing preventing you from using +tuples as atomic labels on an axis: .. ipython:: python @@ -1025,8 +1021,8 @@ Basic indexing on axis with MultiIndex One of the important features of hierarchical indexing is that you can select data by a "partial" label identifying a subgroup in the data. **Partial** -selection "drops" levels of the hierarchical index in the result in a completely -analogous way to selecting a column in a regular DataFrame: +selection "drops" levels of the hierarchical index in the result in a +completely analogous way to selecting a column in a regular DataFrame: .. ipython:: python @@ -1096,6 +1092,8 @@ but as you use it you may uncover corner cases or unintuitive behavior. If you do find something like this, do not hesitate to report the issue or ask on the mailing list. +.. _indexing.xs: + Cross-section with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1275,8 +1273,8 @@ indexed DataFrame: indexed2 = data.set_index(['a', 'b']) indexed2 -The ``append`` keyword option allow you to keep the existing index and append the given -columns to a MultiIndex: +The ``append`` keyword option allow you to keep the existing index and append +the given columns to a MultiIndex: .. ipython:: python @@ -1321,7 +1319,8 @@ discards the index, instead of putting index values in the DataFrame's columns. .. 
note:: - The ``reset_index`` method used to be called ``delevel`` which is now deprecated. + The ``reset_index`` method used to be called ``delevel`` which is now + deprecated. Adding an ad hoc index ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 25c42780afd65..9001ae393d552 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1391,7 +1391,7 @@ of rows in an object. Multiple Table Queries ~~~~~~~~~~~~~~~~~~~~~~ -New in 0.10.1 are the methods ``append_to_multple`` and +New in 0.10.1 are the methods ``append_to_multiple`` and ``select_as_multiple``, that can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your @@ -1535,24 +1535,6 @@ Notes & Caveats ``tables``. The sizes of a string based indexing column (e.g. *columns* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter - ``min_itemsize`` on the first table creation (``min_itemsize`` can - be an integer or a dict of column name to an integer). If - subsequent appends introduce elements in the indexing axis that are - larger than the supported indexer, an Exception will be raised - (otherwise you could have a silent truncation of these indexers, - leading to loss of information). Just to be clear, this fixed-width - restriction applies to **indexables** (the indexing columns) and - **string values** in a mixed_type table. - - .. ipython:: python - - store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) - wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) - store.append('wp_big_strings', wp) - store.select('wp_big_strings') - - # we have provided a minimum minor_axis indexable size - store.root.wp_big_strings.table DataTypes ~~~~~~~~~ @@ -1589,6 +1571,34 @@ conversion may not be necessary in future versions of pandas) df df.dtypes +String Columns +~~~~~~~~~~~~~~ + +The underlying implementation of ``HDFStore`` uses a fixed column width (itemsize) for string columns. A string column itemsize is calculated as the maximum of the +length of data (for that column) that is passed to the ``HDFStore``, **in the first append**. Subsequent appends, may introduce a string for a column **larger** than the column can hold, an Exception will be raised (otherwise you could have a silent truncation of these columns, leading to loss of information). In the future we may relax this and allow a user-specified truncation to occur. + +Pass ``min_itemsize`` on the first table creation to a-priori specifiy the minimum length of a particular string column. ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to allow all *indexables* or *data_columns* to have this min_itemsize. + +Starting in 0.11, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. + +.. note:: + + If you are not passing any *data_columns*, then the min_itemsize will be the maximum of the length of any string passed + +.. 
ipython:: python + + dfs = DataFrame(dict(A = 'foo', B = 'bar'),index=range(5)) + dfs + + # A and B have a size of 30 + store.append('dfs', dfs, min_itemsize = 30) + store.get_storer('dfs').table + + # A is created as a data_column with a size of 30 + # B is size is calculated + store.append('dfs2', dfs, min_itemsize = { 'A' : 30 }) + store.get_storer('dfs2').table + External Compatibility ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rplot.rst b/doc/source/rplot.rst index 7153d5323c805..1f33c789ee3ca 100644 --- a/doc/source/rplot.rst +++ b/doc/source/rplot.rst @@ -22,6 +22,12 @@ Trellis plotting interface ************************** +We import the rplot API: + +.. ipython:: python + + import pandas.tools.rplot as rplot + -------- Examples -------- diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index 9c0a6d5a421c7..0d425c0043d1e 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -1,6 +1,6 @@ .. _whatsnew_0110: -v0.11.0 (March ??, 2013) +v0.11.0 (April 22, 2013) ------------------------ This is a major release from 0.10.1 and includes many new features and @@ -24,64 +24,47 @@ Starting in 0.11.0, object selection has had a number of user-requested addition order to support more explicit location based indexing. Pandas now supports three types of multi-axis indexing. - - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, - allowed inputs are: +- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are: - - A single label, e.g. ``5`` or ``'a'`` + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array - (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - - A list or array of labels ``['a', 'b', 'c']`` - - A slice object with labels ``'a':'f'`` + See more at :ref:`Selection by Label ` - (note that contrary to usual python slices, **both** the start and the stop are included!) - - A boolean array +- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: - See more at :ref:`Selection by Label ` + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array - - ``.iloc`` is strictly integer position based (from 0 to length-1 of the axis), will - raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: + See more at :ref:`Selection by Position ` - - An integer e.g. ``5`` - - A list or array of integers ``[4, 3, 0]`` - - A slice object with ints ``1:7`` - - A boolean array +- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label + based hierarchial indexes. 
- See more at :ref:`Selection by Position ` + As using integer slices with ``.ix`` have different behavior depending on whether the slice + is interpreted as position based or label based, it's usually better to be + explicit and use ``.iloc`` or ``.loc``. - - ``.ix`` supports mixed integer and label based access. It is primarily label based, but - will fallback to integer positional access. ``.ix`` is the most general and will support - any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. - - As using integer slices with ``.ix`` have different behavior depending on whether the slice - is interpreted as integer location based or label position based, it's usually better to be - explicit and use ``.iloc`` (integer location) or ``.loc`` (label location). - - ``.ix`` is especially usefull when dealing with mixed positional/label based hierarchial indexes. - - See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical ` + See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and + :ref:`Fallback Indexing ` Selection Deprecations ~~~~~~~~~~~~~~~~~~~~~~ -Starting in version 0.11.0, these methods may be deprecated in future versions. +Starting in version 0.11.0, these methods *may* be deprecated in future versions. - - ``irow`` - - ``icol`` - - ``iget_value`` +- ``irow`` +- ``icol`` +- ``iget_value`` See the section :ref:`Selection by Position ` for substitutes. -Cross-sectional slices on non-hierarchical indices are now easily performed using -``.loc`` and/or ``.loc``. The methods: - - - ``xs`` (for DataFrame), - - ``minor_xs`` and ``major_xs`` (for Panel) - -now exist primarily for backward compatibility. - -See the section :ref:`Selection by Label ` for substitutes. - Dtypes ~~~~~~ @@ -229,9 +212,11 @@ API changes - Added to_series() method to indicies, to facilitate the creation of indexers (GH3275_) - - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series. + - ``HDFStore`` - - In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()`` + - added the method ``select_column`` to select a single column from a table as a Series. + - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()`` + - ``min_itemsize`` parameter to ``append`` will now automatically create data_columns for passed keys Enhancements ~~~~~~~~~~~~ @@ -244,25 +229,26 @@ Enhancements - Bottleneck is now a :ref:`Recommended Dependencies `, to accelerate certain types of ``nan`` operations - - For ``HDFStore``, support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv`` + - ``HDFStore`` - .. ipython:: python + - support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv`` - df = DataFrame(dict(A=range(5), B=range(5))) - df.to_hdf('store.h5','table',append=True) - read_hdf('store.h5', 'table', where = ['index>2']) + .. ipython:: python - .. ipython:: python - :suppress: - :okexcept: + df = DataFrame(dict(A=range(5), B=range(5))) + df.to_hdf('store.h5','table',append=True) + read_hdf('store.h5', 'table', where = ['index>2']) - os.remove('store.h5') + .. ipython:: python + :suppress: + :okexcept: - - In ``HDFStore``, provide dotted attribute access to ``get`` from stores - (e.g. 
``store.df == store['df']``) + os.remove('store.h5') - - In ``HDFStore``, new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are - provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_) + - provide dotted attribute access to ``get`` from stores, e.g. ``store.df == store['df']`` + + - new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are + provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_) - You can now select timestamps from an *unordered* timeseries similarly to an *ordered* timeseries (GH2437_) @@ -306,7 +292,8 @@ Enhancements only return forward looking data for options near the current stock price. This just obtains the data from Options.get_near_stock_price instead of Options.get_xxx_data() (GH2758_). - + Cursor coordinate information is now displayed in time-series plots. + + - Cursor coordinate information is now displayed in time-series plots. - added option `display.max_seq_items` to control the number of elements printed per sequence pprinting it. (GH2979_) diff --git a/pandas/core/common.py b/pandas/core/common.py index 610477caddba8..01b6dde7d1ecc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1782,11 +1782,25 @@ def in_qtconsole(): """ try: ip = get_ipython() - if ip.config['KernelApp']['parent_appname'] == 'ipython-qtconsole': + front_end = (ip.config.get('KernelApp',{}).get('parent_appname',"") or + ip.config.get('IPKernelApp',{}).get('parent_appname',"")) + if 'qtconsole' in front_end.lower(): return True except: return False +def in_ipnb_frontend(): + """ + check if we're inside an an IPython zmq frontend + """ + try: + ip = get_ipython() + return 'zmq' in str(type(ip)).lower() + except: + pass + + return False + # Unicode consolidation # --------------------- # diff --git a/pandas/core/config.py b/pandas/core/config.py index 96cb33a45d172..59d2772c857bd 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -140,6 +140,9 @@ def _reset_option(pat): for k in keys: _set_option(k, _registered_options[k].defval) +def get_default_val(pat): + key = _get_single_key(pat, silent=True) + return _get_registered_option(key).defval class DictWrapper(object): """ provide attribute-style access to a nested dict @@ -698,7 +701,12 @@ def is_instance_factory(_type): """ def inner(x): - if not isinstance(x, _type): + if isinstance(_type,(tuple,list)) : + if not any([isinstance(x,t) for t in _type]): + from pandas.core.common import pprint_thing as pp + pp_values = map(pp, _type) + raise ValueError("Value must be an instance of %s" % pp("|".join(pp_values))) + elif not isinstance(x, _type): raise ValueError("Value must be an instance of '%s'" % str(_type)) return inner diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 9b3acdf4cbb58..71b4539265069 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -1,6 +1,6 @@ import pandas.core.config as cf from pandas.core.config import (is_int, is_bool, is_text, is_float, - is_instance_factory,is_one_of_factory) + is_instance_factory,is_one_of_factory,get_default_val) from pandas.core.format import detect_console_encoding """ @@ -34,7 +34,8 @@ : int This sets the maximum number of rows pandas should output when printing out various output. For example, this value determines whether the repr() - for a dataframe prints out fully or just an summary repr. + for a dataframe prints out fully or just a summary repr. + 'None' value means unlimited. 
""" pc_max_cols_doc = """ @@ -46,6 +47,7 @@ format in case all columns would not fit vertically. The IPython notebook, IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to do correct auto-detection. + 'None' value means unlimited. """ pc_max_info_cols_doc = """ @@ -122,23 +124,24 @@ """ pc_line_width_deprecation_warning = """\ -use display.width instead (currently both are indentical) +line_width has been deprecated, use display.width instead (currently both are identical) """ pc_width_doc = """ : int - Width of the display. In case python/IPython is running in a terminal this - can be set to 0 and pandas will correctly auto-detect the width. Note that - the IPython notebook, IPython qtconsole, or IDLE do not run in a terminal - and hence it is not possible to correctly detect the width. + Width of the display in characters. In case python/IPython is running in + a terminal this can be set to None and pandas will correctly auto-detect the + width. + Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible to correctly detect the width. """ pc_height_doc = """ : int - Height of the display. In case python/IPython is running in a terminal this - can be set to 0 and pandas will auto-detect the width. Note that the - IPython notebook, IPython qtconsole, or IDLE do not run in a terminal, - and hence it is not possible to correctly detect the height. + Height of the display in lines. In case python/IPython is running in a + terminal this can be set to None and pandas will auto-detect the width. + Note that the IPython notebook, IPython qtconsole, or IDLE do not run + in a terminal, and hence it is not possible to correctly detect the height. """ pc_chop_threshold_doc = """ @@ -208,9 +211,11 @@ def mpl_style_cb(key): cf.register_option('column_space', 12, validator=is_int) cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc, validator=is_instance_factory((int, type(None)))) - cf.register_option('max_rows', 100, pc_max_rows_doc, validator=is_int) + cf.register_option('max_rows', 60, pc_max_rows_doc, + validator=is_instance_factory([type(None), int])) cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) - cf.register_option('max_columns', 20, pc_max_cols_doc, validator=is_int) + cf.register_option('max_columns', 20, pc_max_cols_doc, + validator=is_instance_factory([type(None), int])) cf.register_option('max_info_columns', 100, pc_max_info_cols_doc, validator=is_int) cf.register_option('colheader_justify', 'right', colheader_justify_doc, @@ -228,14 +233,17 @@ def mpl_style_cb(key): cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc, validator=is_text) cf.register_option('expand_frame_repr', True, pc_expand_repr_doc) - cf.register_option('line_width', 80, pc_line_width_doc) cf.register_option('chop_threshold', None, pc_chop_threshold_doc) cf.register_option('max_seq_items', None, pc_max_seq_items) cf.register_option('mpl_style', None, pc_mpl_style_doc, validator=is_one_of_factory([None, False, 'default']), cb=mpl_style_cb) - cf.register_option('height', 100, pc_height_doc, validator=is_int) - cf.register_option('width',80, pc_width_doc, validator=is_int) + cf.register_option('height', 60, pc_height_doc, + validator=is_instance_factory([type(None), int])) + cf.register_option('width',80, pc_width_doc, + validator=is_instance_factory([type(None), int])) + # redirected to width, make defval identical + cf.register_option('line_width', 
get_default_val('display.width'), pc_line_width_doc) cf.deprecate_option('display.line_width', msg=pc_line_width_deprecation_warning, rkey='display.width') diff --git a/pandas/core/daterange.py b/pandas/core/daterange.py index 954d72defdbbb..9ddd76c471d44 100644 --- a/pandas/core/daterange.py +++ b/pandas/core/daterange.py @@ -9,6 +9,8 @@ # DateRange class class DateRange(Index): + """Deprecated + """ offset = tzinfo = None diff --git a/pandas/core/format.py b/pandas/core/format.py index 0eb4a2a4e5e08..0f0029167ce64 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -12,6 +12,7 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.util import py3compat from pandas.util.compat import OrderedDict +from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option, set_option, reset_option import pandas.core.common as com import pandas.lib as lib @@ -165,7 +166,9 @@ def _encode_diff_func(): encoding = get_option("display.encoding") def _encode_diff(x): - return len(x) - len(x.decode(encoding)) + if not isinstance(x,unicode): + return len(x) - len(x.decode(encoding)) + return 0 return _encode_diff @@ -354,7 +357,7 @@ def get_col_type(dtype): return 'r' else: return 'l' - + import warnings if force_unicode is not None: # pragma: no cover warnings.warn( @@ -370,7 +373,7 @@ def get_col_type(dtype): strcols = [[info_line]] else: strcols = self._to_str_columns() - + if column_format is None: dtypes = self.frame.dtypes.values column_format = 'l%s' % ''.join(map(get_col_type, dtypes)) @@ -1639,13 +1642,14 @@ def reset_printoptions(): FutureWarning) reset_option("^display\.") - +_initial_defencoding = None def detect_console_encoding(): """ Try to find the most capable encoding supported by the console. slighly modified from the way IPython handles the same issue. """ import locale + global _initial_defencoding encoding = None try: @@ -1653,18 +1657,60 @@ def detect_console_encoding(): except AttributeError: pass - if not encoding or encoding == 'ascii': # try again for something better + if not encoding or 'ascii' in encoding.lower(): # try again for something better try: encoding = locale.getpreferredencoding() except Exception: pass - if not encoding: # when all else fails. this will usually be "ascii" + if not encoding or 'ascii' in encoding.lower(): # when all else fails. this will usually be "ascii" encoding = sys.getdefaultencoding() + # GH3360, save the reported defencoding at import time + # MPL backends may change it. Make available for debugging. + if not _initial_defencoding: + _initial_defencoding = sys.getdefaultencoding() + return encoding +def get_console_size(): + """Return console size as tuple = (width, height). + + May return (None,None) in some cases. + """ + display_width = get_option('display.width') + display_height = get_option('display.height') + + # Consider + # interactive shell terminal, can detect term size + # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term size + # non-interactive script, should disregard term size + + # in addition + # width,height have default values, but setting to 'None' signals + # should use Auto-Detection, But only in interactive shell-terminal. + # Simple. yeah. 
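    # A rough summary of the branches that follow (illustrative only):
    #   interactive shell terminal   -> auto-detect via get_terminal_size()
    #   interactive ipnb/qtconsole   -> fall back to the config_init defaults
    #   non-interactive script       -> terminal size is (None, None)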
+ + if com.in_interactive_session(): + if com.in_ipnb_frontend(): + # sane defaults for interactive non-shell terminal + # match default for width,height in config_init + from pandas.core.config import get_default_val + terminal_width = get_default_val('display.width') + terminal_height = get_default_val('display.height') + else: + # pure terminal + terminal_width, terminal_height = get_terminal_size() + else: + terminal_width, terminal_height = None,None + + # Note if the User sets width/Height to None (auto-detection) + # and we're in a script (non-inter), this will return (None,None) + # caller needs to deal. + return (display_width or terminal_width, display_height or terminal_height) + + class EngFormatter(object): """ Formats float values according to engineering format. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ab7d23acf183e..6fd627f42e055 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -333,6 +333,41 @@ def f(self, other): class DataFrame(NDFrame): + """ Two-dimensional size-mutable, potentially heterogeneous tabular data + structure with labeled axes (rows and columns). Arithmetic operations + align on both row and column labels. Can be thought of as a dict-like + container for Series objects. The primary pandas data structure + + Parameters + ---------- + data : numpy ndarray (structured or homogeneous), dict, or DataFrame + Dict can contain Series, arrays, constants, or list-like objects + index : Index or array-like + Index to use for resulting frame. Will default to np.arange(n) if + no indexing information part of input data and no index provided + columns : Index or array-like + Will default to np.arange(n) if not column labels provided + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input + + Examples + -------- + >>> d = {'col1': ts1, 'col2': ts2} + >>> df = DataFrame(data=d, index=index) + >>> df2 = DataFrame(np.random.randn(10, 5)) + >>> df3 = DataFrame(np.random.randn(10, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + + See also + -------- + DataFrame.from_records: constructor from tuples, also record arrays + DataFrame.from_dict: from dicts of Series, arrays, or dicts + DataFrame.from_csv: from CSV files + DataFrame.from_items: from sequence of (key, value) pairs + read_csv / read_table / read_clipboard + """ _auto_consolidate = True _het_axis = 1 _info_axis = 'columns' @@ -347,41 +382,6 @@ class DataFrame(NDFrame): def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): - """Two-dimensional size-mutable, potentially heterogeneous tabular data - structure with labeled axes (rows and columns). Arithmetic operations - align on both row and column labels. Can be thought of as a dict-like - container for Series objects. The primary pandas data structure - - Parameters - ---------- - data : numpy ndarray (structured or homogeneous), dict, or DataFrame - Dict can contain Series, arrays, constants, or list-like objects - index : Index or array-like - Index to use for resulting frame. Will default to np.arange(n) if - no indexing information part of input data and no index provided - columns : Index or array-like - Will default to np.arange(n) if not column labels provided - dtype : dtype, default None - Data type to force, otherwise infer - copy : boolean, default False - Copy data from inputs. 
Only affects DataFrame / 2d ndarray input - - Examples - -------- - >>> d = {'col1': ts1, 'col2': ts2} - >>> df = DataFrame(data=d, index=index) - >>> df2 = DataFrame(np.random.randn(10, 5)) - >>> df3 = DataFrame(np.random.randn(10, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) - - See also - -------- - DataFrame.from_records: constructor from tuples, also record arrays - DataFrame.from_dict: from dicts of Series, arrays, or dicts - DataFrame.from_csv: from CSV files - DataFrame.from_items: from sequence of (key, value) pairs - read_csv / read_table / read_clipboard - """ if data is None: data = {} @@ -602,18 +602,20 @@ def __nonzero__(self): def _repr_fits_vertical_(self): """ Check if full repr fits in vertical boundaries imposed by the display - options height and max_columns. In case off non-interactive session, + options height and max_rows. In case of non-interactive session, no boundaries apply. """ - if not com.in_interactive_session(): - return True + width, height = fmt.get_console_size() + max_rows = get_option("display.max_rows") - terminal_width, terminal_height = get_terminal_size() + if height is None and max_rows is None: + return True - # excluding column axis area - max_rows = get_option("display.max_rows") or terminal_height - display_height = get_option("display.height") or terminal_height - return len(self.index) <= min(max_rows, display_height) + else: + # min of two, where one may be None + height = height or max_rows +1 + max_rows = max_rows or height +1 + return len(self) <= min(max_rows, height) def _repr_fits_horizontal_(self): """ @@ -621,23 +623,35 @@ def _repr_fits_horizontal_(self): options width and max_columns. In case off non-interactive session, no boundaries apply. """ - if not com.in_interactive_session(): - return True - - terminal_width, terminal_height = get_terminal_size() - + width, height = fmt.get_console_size() max_columns = get_option("display.max_columns") - display_width = get_option("display.width") or terminal_width nb_columns = len(self.columns) + + # exceed max columns if ((max_columns and nb_columns > max_columns) or - (nb_columns > (display_width // 2))): + (width and nb_columns > (width // 2))): return False + if width is None: + # no sense finding width of repr if no width set + return True + buf = StringIO() - self.to_string(buf=buf) + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + max_rows = get_option("display.max_rows") + if not (height is None and max_rows is None): + # min of two, where one may be None + height = height or max_rows +1 + max_rows = max_rows or height +1 + d=d.iloc[:min(max_rows, height,len(d))] + + d.to_string(buf=buf) value = buf.getvalue() repr_width = max([len(l) for l in value.split('\n')]) - return repr_width <= display_width + return repr_width <= width def __str__(self): """ @@ -670,19 +684,24 @@ def __unicode__(self): """ buf = StringIO(u"") fits_vertical = self._repr_fits_vertical_() - fits_horizontal = self._repr_fits_horizontal_() + fits_horizontal = False + if fits_vertical: + # This needs to compute the entire repr + # so don't do it unless rownum is bounded + fits_horizontal = self._repr_fits_horizontal_() + if fits_vertical and fits_horizontal: self.to_string(buf=buf) else: - terminal_width, terminal_height = get_terminal_size() - max_rows = get_option("display.max_rows") or terminal_height - # Expand or info? 
Decide based on option display.expand_frame_repr - # and keep it sane for the number of display rows used by the - # expanded repr. + width, height = fmt.get_console_size() + max_rows = get_option("display.max_rows") or height + # expand_repr basically takes the extrac columns that don't + # fit the width, and creates a new page, which increases + # the effective row count. check number of cols agaibst + # max rows to catch wrapping. that would exceed max_rows. if (get_option("display.expand_frame_repr") and fits_vertical and len(self.columns) < max_rows): - line_width = get_option("display.width") or terminal_width - self.to_string(buf=buf, line_width=line_width) + self.to_string(buf=buf, line_width=width) else: max_info_rows = get_option('display.max_info_rows') verbose = (max_info_rows is None or @@ -707,11 +726,14 @@ def _repr_html_(self): Return a html representation for a particular DataFrame. Mainly for IPython notebook. """ - if com.in_qtconsole(): - raise ValueError('Disable HTML output in QtConsole') if get_option("display.notebook_repr_html"): - if self._repr_fits_horizontal_() and self._repr_fits_vertical_(): + fits_vertical = self._repr_fits_vertical_() + fits_horizontal = False + if fits_vertical: + fits_horizontal = self._repr_fits_horizontal_() + + if fits_horizontal and fits_vertical: return ('
\n' + self.to_html() + '\n
') @@ -1580,7 +1602,7 @@ def info(self, verbose=True, buf=None, max_cols=None): # hack if max_cols is None: - max_cols = get_option('display.max_info_columns') + max_cols = get_option('display.max_info_columns',len(self.columns)+1) if verbose and len(self.columns) <= max_cols: lines.append('Data columns (total %d columns):' % len(self.columns)) @@ -2226,8 +2248,9 @@ def xs(self, key, axis=0, level=None, copy=True): raise ValueError('Cannot retrieve view (copy=False)') # level = 0 - if not isinstance(loc, slice): - indexer = [slice(None, None)] * 2 + loc_is_slice = isinstance(loc, slice) + if not loc_is_slice: + indexer = [slice(None)] * 2 indexer[axis] = loc indexer = tuple(indexer) else: @@ -2237,10 +2260,9 @@ def xs(self, key, axis=0, level=None, copy=True): indexer = self.index[loc] # select on the correct axis - if axis == 1: - result = self.ix[:, indexer] - else: - result = self.ix[indexer] + if axis == 1 and loc_is_slice: + indexer = slice(None), indexer + result = self.ix[indexer] setattr(result, result._get_axis_name(axis), new_ax) return result @@ -4930,25 +4952,10 @@ def _get_agg_axis(self, axis_num): raise Exception('Must have 0<= axis <= 1') def _get_numeric_data(self): - if self._is_mixed_type: - num_data = self._data.get_numeric_data() - return DataFrame(num_data, index=self.index, copy=False) - else: - if (self.values.dtype != np.object_ and - not issubclass(self.values.dtype.type, np.datetime64)): - return self - else: - return self.ix[:, []] + return self._constructor(self._data.get_numeric_data(), index=self.index, copy=False) def _get_bool_data(self): - if self._is_mixed_type: - bool_data = self._data.get_bool_data() - return DataFrame(bool_data, index=self.index, copy=False) - else: # pragma: no cover - if self.values.dtype == np.bool_: - return self - else: - return self.ix[:, []] + return self._constructor(self._data.get_bool_data(), index=self.index, copy=False) def quantile(self, q=0.5, axis=0, numeric_only=True): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8b3fb4c2fba0d..aef44bd91396d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -620,7 +620,9 @@ def apply(self, f, data, axis=0, keep_internal=False): try: values, mutated = splitter.fast_apply(f, group_keys) return group_keys, values, mutated - except lib.InvalidApply: + except (Exception), detail: + # we detect a mutatation of some kind + # so take slow path pass result_values = [] diff --git a/pandas/core/index.py b/pandas/core/index.py index 43cb7734a1cc5..6bdd4d89831b9 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,7 +1,5 @@ # pylint: disable=E1101,E1103,W0232 -from datetime import time - from itertools import izip import numpy as np @@ -1282,6 +1280,25 @@ def drop(self, labels): class Int64Index(Index): + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects. Int64Index is a special case of `Index` + with purely integer labels. This is the default index type used by the DataFrame + and Series ctors when no explicit index is provided by the user. 
+ + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: object) + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + Note + ---- + An Index instance can **only** contain hashable objects + """ _groupby = _algos.groupby_int64 _arrmap = _algos.arrmap_int64 @@ -1485,7 +1502,7 @@ def __unicode__(self): np.set_printoptions(threshold=50) if len(self) > 100: - values = self[:50].format() + self[-50:].format() + values = self[:50].format() + ["..."] + self[-50:].format() else: values = self.format() diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b44ef5d465bb9..94029e3212057 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -30,9 +30,7 @@ class Block(object): _can_hold_na = False _downcast_dtype = None - def __init__(self, values, items, ref_items, ndim=2): - if issubclass(values.dtype.type, basestring): - values = np.array(values, dtype=object) + def __init__(self, values, items, ref_items, ndim=2, fastpath=False): if values.ndim != ndim: raise ValueError('Wrong number of dimensions') @@ -44,8 +42,13 @@ def __init__(self, values, items, ref_items, ndim=2): self._ref_locs = None self.values = values self.ndim = ndim - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) + + if fastpath: + self.items = items + self.ref_items = ref_items + else: + self.items = _ensure_index(items) + self.ref_items = _ensure_index(ref_items) def _gi(self, arg): return self.values[arg] @@ -114,7 +117,7 @@ def copy(self, deep=True): values = self.values if deep: values = values.copy() - return make_block(values, self.items, self.ref_items) + return make_block(values, self.items, self.ref_items, klass=self.__class__, fastpath=True) def merge(self, other): if not self.ref_items.equals(other.ref_items): @@ -133,7 +136,7 @@ def reindex_axis(self, indexer, axis=1, fill_value=np.nan, mask_info=None): raise AssertionError('axis must be at least 1, got %d' % axis) new_values = com.take_nd(self.values, indexer, axis, fill_value=fill_value, mask_info=mask_info) - return make_block(new_values, self.items, self.ref_items) + return make_block(new_values, self.items, self.ref_items, fastpath=True) def reindex_items_from(self, new_ref_items, copy=True): """ @@ -155,7 +158,7 @@ def reindex_items_from(self, new_ref_items, copy=True): new_values = com.take_nd(self.values, masked_idx, axis=0, allow_fill=False) new_items = self.items.take(masked_idx) - return make_block(new_values, new_items, new_ref_items) + return make_block(new_values, new_items, new_ref_items, fastpath=True) def get(self, item): loc = self.items.get_loc(item) @@ -181,7 +184,7 @@ def delete(self, item): loc = self.items.get_loc(item) new_items = self.items.delete(loc) new_values = np.delete(self.values, loc, 0) - return make_block(new_values, new_items, self.ref_items) + return make_block(new_values, new_items, self.ref_items, klass=self.__class__, fastpath=True) def split_block_at(self, item): """ @@ -204,7 +207,9 @@ def split_block_at(self, item): for s, e in com.split_ranges(mask): yield make_block(self.values[s:e], self.items[s:e].copy(), - self.ref_items) + self.ref_items, + klass=self.__class__, + fastpath=True) def fillna(self, value, inplace=False, downcast=None): if not self._can_hold_na: @@ -217,7 +222,7 @@ def fillna(self, value, inplace=False, downcast=None): mask = com.isnull(new_values) np.putmask(new_values, mask, value) - block = make_block(new_values, self.items, self.ref_items) + block = 
make_block(new_values, self.items, self.ref_items, fastpath=True) if downcast: block = block.downcast() return block @@ -251,7 +256,7 @@ def astype(self, dtype, copy = True, raise_on_error = True): """ try: newb = make_block(com._astype_nansafe(self.values, dtype, copy = copy), - self.items, self.ref_items) + self.items, self.ref_items, fastpath=True) except: if raise_on_error is True: raise @@ -365,14 +370,14 @@ def putmask(self, mask, new, inplace=False): nv = new_values[i] if inplace else new_values[i].copy() nv = _block_shape(nv) - new_blocks.append(make_block(nv, [ item ], self.ref_items)) + new_blocks.append(make_block(nv, Index([ item ]), self.ref_items, fastpath=True)) return new_blocks if inplace: return [ self ] - return [ make_block(new_values, self.items, self.ref_items) ] + return [ make_block(new_values, self.items, self.ref_items, fastpath=True) ] def interpolate(self, method='pad', axis=0, inplace=False, limit=None, missing=None, coerce=False): @@ -403,14 +408,14 @@ def interpolate(self, method='pad', axis=0, inplace=False, else: com.backfill_2d(transf(values), limit=limit, mask=mask) - return make_block(values, self.items, self.ref_items) + return make_block(values, self.items, self.ref_items, klass=self.__class__, fastpath=True) - def take(self, indexer, axis=1): + def take(self, indexer, ref_items, axis=1): if axis < 1: raise AssertionError('axis must be at least 1, got %d' % axis) new_values = com.take_nd(self.values, indexer, axis=axis, allow_fill=False) - return make_block(new_values, self.items, self.ref_items) + return make_block(new_values, self.items, ref_items, klass=self.__class__, fastpath=True) def get_values(self, dtype): return self.values @@ -418,7 +423,7 @@ def get_values(self, dtype): def diff(self, n): """ return block for the diff of the values """ new_values = com.diff(self.values, n, axis=1) - return make_block(new_values, self.items, self.ref_items) + return make_block(new_values, self.items, self.ref_items, fastpath=True) def shift(self, indexer, periods): """ shift the block by periods, possibly upcast """ @@ -431,7 +436,7 @@ def shift(self, indexer, periods): new_values[:, :periods] = fill_value else: new_values[:, periods:] = fill_value - return make_block(new_values, self.items, self.ref_items) + return make_block(new_values, self.items, self.ref_items, fastpath=True) def eval(self, func, other, raise_on_error = True, try_cast = False): """ @@ -486,7 +491,7 @@ def eval(self, func, other, raise_on_error = True, try_cast = False): if try_cast: result = self._try_cast_result(result) - return make_block(result, self.items, self.ref_items) + return make_block(result, self.items, self.ref_items, fastpath=True) def where(self, other, cond, raise_on_error = True, try_cast = False): """ @@ -551,7 +556,7 @@ def func(c,v,o): result.fill(np.nan) return result - def create_block(result, items, transpose = True): + def create_block(result, items, transpose=True): if not isinstance(result, np.ndarray): raise TypeError('Could not compare [%s] with block values' % repr(other)) @@ -581,7 +586,7 @@ def create_block(result, items, transpose = True): result = np.repeat(result,self.shape[1:]) result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:]) - result_blocks.append(create_block(result, item, transpose = False)) + result_blocks.append(create_block(result, item, transpose=False)) return result_blocks else: @@ -683,6 +688,12 @@ class ObjectBlock(Block): is_object = True _can_hold_na = True + def __init__(self, values, items, ref_items, ndim=2, 
fastpath=False): + if issubclass(values.dtype.type, basestring): + values = np.array(values, dtype=object) + + super(ObjectBlock, self).__init__(values, items, ref_items, ndim=ndim, fastpath=fastpath) + @property def is_bool(self): """ we can be a bool if we have only bool values but are of type object """ @@ -704,7 +715,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True): values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric) values = _block_shape(values) items = self.items.take([i]) - newb = make_block(values, items, self.ref_items) + newb = make_block(values, items, self.ref_items, fastpath=True) blocks.append(newb) return blocks @@ -727,11 +738,11 @@ def should_store(self, value): class DatetimeBlock(Block): _can_hold_na = True - def __init__(self, values, items, ref_items, ndim=2): + def __init__(self, values, items, ref_items, ndim=2, fastpath=True): if values.dtype != _NS_DTYPE: values = tslib.cast_to_nanoseconds(values) - Block.__init__(self, values, items, ref_items, ndim=ndim) + super(DatetimeBlock, self).__init__(values, items, ref_items, ndim=ndim, fastpath=fastpath) def _gi(self, arg): return lib.Timestamp(self.values[arg]) @@ -813,40 +824,41 @@ def get_values(self, dtype): return self.values -def make_block(values, items, ref_items): - dtype = values.dtype - vtype = dtype.type - klass = None - - if issubclass(vtype, np.floating): - klass = FloatBlock - elif issubclass(vtype, np.complexfloating): - klass = ComplexBlock - elif issubclass(vtype, np.datetime64): - klass = DatetimeBlock - elif issubclass(vtype, np.integer): - klass = IntBlock - elif dtype == np.bool_: - klass = BoolBlock - - # try to infer a datetimeblock - if klass is None and np.prod(values.shape): - flat = values.ravel() - inferred_type = lib.infer_dtype(flat) - if inferred_type == 'datetime': - - # we have an object array that has been inferred as datetime, so - # convert it - try: - values = tslib.array_to_datetime(flat).reshape(values.shape) - klass = DatetimeBlock - except: # it already object, so leave it - pass +def make_block(values, items, ref_items, klass = None, fastpath=False): if klass is None: - klass = ObjectBlock - - return klass(values, items, ref_items, ndim=values.ndim) + dtype = values.dtype + vtype = dtype.type + + if issubclass(vtype, np.floating): + klass = FloatBlock + elif issubclass(vtype, np.complexfloating): + klass = ComplexBlock + elif issubclass(vtype, np.datetime64): + klass = DatetimeBlock + elif issubclass(vtype, np.integer): + klass = IntBlock + elif dtype == np.bool_: + klass = BoolBlock + + # try to infer a datetimeblock + if klass is None and np.prod(values.shape): + flat = values.ravel() + inferred_type = lib.infer_dtype(flat) + if inferred_type == 'datetime': + + # we have an object array that has been inferred as datetime, so + # convert it + try: + values = tslib.array_to_datetime(flat).reshape(values.shape) + klass = DatetimeBlock + except: # it already object, so leave it + pass + + if klass is None: + klass = ObjectBlock + + return klass(values, items, ref_items, ndim=values.ndim, fastpath=fastpath) # TODO: flexible with index=None and/or items=None @@ -1168,8 +1180,11 @@ def get_slice(self, slobj, axis=0, raise_on_error=False): new_items = new_axes[0] if len(self.blocks) == 1: blk = self.blocks[0] - newb = make_block(blk.values[slobj], new_items, - new_items) + newb = make_block(blk.values[slobj], + new_items, + new_items, + klass=blk.__class__, + fastpath=True) new_blocks = [newb] 
else: return self.reindex_items(new_items) @@ -1186,8 +1201,11 @@ def _slice_blocks(self, slobj, axis): slicer = tuple(slicer) for block in self.blocks: - newb = make_block(block.values[slicer], block.items, - block.ref_items) + newb = make_block(block.values[slicer], + block.items, + block.ref_items, + klass=block.__class__, + fastpath=True) new_blocks.append(newb) return new_blocks @@ -1296,13 +1314,22 @@ def xs(self, key, axis=1, copy=True): raise Exception('cannot get view of mixed-type or ' 'non-consolidated DataFrame') for blk in self.blocks: - newb = make_block(blk.values[slicer], blk.items, blk.ref_items) + newb = make_block(blk.values[slicer], + blk.items, + blk.ref_items, + klass=blk.__class__, + fastpath=True) new_blocks.append(newb) elif len(self.blocks) == 1: - vals = self.blocks[0].values[slicer] + block = self.blocks[0] + vals = block.values[slicer] if copy: vals = vals.copy() - new_blocks = [make_block(vals, self.items, self.items)] + new_blocks = [make_block(vals, + self.items, + self.items, + klass=block.__class__, + fastpath=True)] return BlockManager(new_blocks, new_axes) @@ -1491,7 +1518,7 @@ def _add_new_block(self, item, value, loc=None): if loc is None: loc = self.items.get_loc(item) new_block = make_block(value, self.items[loc:loc + 1].copy(), - self.items) + self.items, fastpath=True) self.blocks.append(new_block) def _find_block(self, item): @@ -1569,7 +1596,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value): new_values = com.take_nd(blk.values, blk_indexer[selector], axis=0, allow_fill=False) new_blocks.append(make_block(new_values, new_block_items, - new_items)) + new_items, fastpath=True)) if not mask.all(): na_items = new_items[-mask] @@ -1593,7 +1620,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan): # TODO: this part could be faster (!) 
new_items, indexer = self.items.reindex(new_items) - # could have some pathological (MultiIndex) issues here + # could have so me pathological (MultiIndex) issues here new_blocks = [] if indexer is None: for blk in self.blocks: @@ -1630,7 +1657,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan): na_block = make_block(block_values, items, ref_items) return na_block - def take(self, indexer, axis=1, verify=True): + def take(self, indexer, new_index=None, axis=1, verify=True): if axis < 1: raise AssertionError('axis must be at least 1, got %d' % axis) @@ -1645,15 +1672,11 @@ def take(self, indexer, axis=1, verify=True): 'the axis length') new_axes = list(self.axes) - new_axes[axis] = self.axes[axis].take(indexer) - new_blocks = [] - for blk in self.blocks: - new_values = com.take_nd(blk.values, indexer, axis=axis, - allow_fill=False) - newb = make_block(new_values, blk.items, self.items) - new_blocks.append(newb) + if new_index is None: + new_index = self.axes[axis].take(indexer) - return BlockManager(new_blocks, new_axes) + new_axes[axis] = new_index + return self.apply('take',axes=new_axes,indexer=indexer,ref_items=new_axes[0],axis=axis) def merge(self, other, lsuffix=None, rsuffix=None): if not self._is_indexed_like(other): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 8e18e93e955ef..4f346d2e1860e 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -147,6 +147,24 @@ def f(self, other): class Panel(NDFrame): + """ + Represents wide format panel data, stored as 3-dimensional array + + Parameters + ---------- + data : ndarray (items x major x minor), or dict of DataFrames + items : Index or array-like + axis=1 + major_axis : Index or array-like + axis=1 + minor_axis : Index or array-like + axis=2 + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input + """ + _AXIS_ORDERS = ['items', 'major_axis', 'minor_axis'] _AXIS_NUMBERS = dict([(a, i) for i, a in enumerate(_AXIS_ORDERS)]) _AXIS_ALIASES = { @@ -218,23 +236,6 @@ def _construct_axes_dict_for_slice(self, axes=None, **kwargs): def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None): - """ - Represents wide format panel data, stored as 3-dimensional array - - Parameters - ---------- - data : ndarray (items x major x minor), or dict of DataFrames - items : Index or array-like - axis=1 - major_axis : Index or array-like - axis=1 - minor_axis : Index or array-like - axis=2 - dtype : dtype, default None - Data type to force, otherwise infer - copy : boolean, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input - """ self._init_data( data=data, items=items, major_axis=major_axis, minor_axis=minor_axis, copy=copy, dtype=dtype) diff --git a/pandas/core/panel4d.py b/pandas/core/panel4d.py index b2fb2d25e2355..4113832f086fb 100644 --- a/pandas/core/panel4d.py +++ b/pandas/core/panel4d.py @@ -11,12 +11,8 @@ 'minor_axis': 'minor_axis'}, slicer=Panel, axis_aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) - - -def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, - minor_axis=None, copy=False, dtype=None): - """ + stat_axis=2, + ns=dict(__doc__= """ Represents a 4 dimensonal structured Parameters @@ -33,6 +29,14 @@ def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, copy : boolean, default False Copy data from inputs. 
Only affects DataFrame / 2d ndarray input """ + + ) + ) + + +def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, + minor_axis=None, copy=False, dtype=None): + self._init_data(data=data, labels=labels, items=items, major_axis=major_axis, minor_axis=minor_axis, copy=copy, dtype=dtype) diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index ce9b43aabaa5b..08ff3b70dcb13 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -3,7 +3,7 @@ import pandas.lib as lib -def create_nd_panel_factory(klass_name, axis_orders, axis_slices, slicer, axis_aliases=None, stat_axis=2): +def create_nd_panel_factory(klass_name, axis_orders, axis_slices, slicer, axis_aliases=None, stat_axis=2,ns=None): """ manufacture a n-d class: parameters @@ -35,7 +35,8 @@ def create_nd_panel_factory(klass_name, axis_orders, axis_slices, slicer, axis_a raise Exception("cannot create this slicer [%s]" % slicer) # build the klass - klass = type(klass_name, (slicer,), {}) + ns = {} if not ns else ns + klass = type(klass_name, (slicer,), ns) # add the class variables klass._AXIS_ORDERS = axis_orders diff --git a/pandas/core/series.py b/pandas/core/series.py index 919dd57ee70ab..a68234b5d6bc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -386,6 +386,33 @@ def f(self, axis=0, dtype=None, out=None, skipna=True, level=None): class Series(pa.Array, generic.PandasObject): + """ + One-dimensional ndarray with axis labels (including time series). + Labels need not be unique but must be any hashable type. The object + supports both integer- and label-based indexing and provides a host of + methods for performing operations involving the index. Statistical + methods from ndarray have been overridden to automatically exclude + missing data (currently represented as NaN) + + Operations between Series (+, -, /, *, **) align values based on their + associated index values-- they need not be the same length. The result + index will be the sorted union of the two indexes. + + Parameters + ---------- + data : array-like, dict, or scalar value + Contains data stored in Series + index : array-like or Index (1d) + Values must be unique and hashable, same length as data. Index + object (or other iterable of same length as data) Will default to + np.arange(len(data)) if not provided. If both a dict and index + sequence are used, the index will override the keys found in the + dict. + dtype : numpy.dtype or None + If None, dtype will be inferred copy : boolean, default False Copy + input data + copy : boolean, default False + """ _AXIS_NUMBERS = { 'index': 0 } @@ -411,7 +438,7 @@ def __new__(cls, data=None, index=None, dtype=None, name=None, elif isinstance(data, dict): if index is None: from pandas.util.compat import OrderedDict - if isinstance(data,OrderedDict): + if isinstance(data, OrderedDict): index = Index(data) else: index = Index(sorted(data)) @@ -482,33 +509,6 @@ def from_array(cls, arr, index=None, name=None, copy=False): def __init__(self, data=None, index=None, dtype=None, name=None, copy=False): - """ - One-dimensional ndarray with axis labels (including time series). - Labels need not be unique but must be any hashable type. The object - supports both integer- and label-based indexing and provides a host of - methods for performing operations involving the index. 
Statistical - methods from ndarray have been overridden to automatically exclude - missing data (currently represented as NaN) - - Operations between Series (+, -, /, *, **) align values based on their - associated index values-- they need not be the same length. The result - index will be the sorted union of the two indexes. - - Parameters - ---------- - data : array-like, dict, or scalar value - Contains data stored in Series - index : array-like or Index (1d) - Values must be unique and hashable, same length as data. Index - object (or other iterable of same length as data) Will default to - np.arange(len(data)) if not provided. If both a dict and index - sequence are used, the index will override the keys found in the - dict. - dtype : numpy.dtype or None - If None, dtype will be inferred copy : boolean, default False Copy - input data - copy : boolean, default False - """ pass @property @@ -3366,7 +3366,34 @@ def _get_fill_func(method): class TimeSeries(Series): + """ + The time series varians of Series, a One-dimensional ndarray with `TimeStamp` + axis labels. + Labels need not be unique but must be any hashable type. The object + supports both integer- and label-based indexing and provides a host of + methods for performing operations involving the index. Statistical + methods from ndarray have been overridden to automatically exclude + missing data (currently represented as NaN) + + Operations between Series (+, -, /, *, **) align values based on their + associated index values-- they need not be the same length. The result + index will be the sorted union of the two indexes. + Parameters + ---------- + data : array-like, dict, or scalar value + Contains data stored in Series + index : array-like or Index (1d) + Values must be unique and hashable, same length as data. Index + object (or other iterable of same length as data) Will default to + np.arange(len(data)) if not provided. If both a dict and index + sequence are used, the index will override the keys found in the + dict. 
+ dtype : numpy.dtype or None + If None, dtype will be inferred copy : boolean, default False Copy + input data + copy : boolean, default False + """ def _repr_footer(self): if self.index.freq is not None: freqstr = 'Freq: %s, ' % self.index.freqstr diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d24f36b05cabd..60798bacbc144 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -649,9 +649,10 @@ def read(self, nrows=None): def _create_index(self, col_dict, columns): pass - # backwards compatibility - get_chunk = read - + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + return self.read(nrows=size) def _is_index_col(col): return col is not None and col is not False @@ -1285,7 +1286,10 @@ def read(self, rows=None): return index, columns, data # legacy - get_chunk = read + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + return self.read(nrows=size) def _convert_data(self, data): # apply converters @@ -1522,7 +1526,7 @@ def _get_lines(self, rows=None): new_rows.append(next(source)) rows += 1 except csv.Error, inst: - if 'newline inside string' in inst.message: + if 'newline inside string' in str(inst): row_num = str(self.pos + rows) msg = ('EOF inside string starting with line ' + row_num) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0568ee7f7f8bf..be11732d7b3a2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1207,13 +1207,20 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): self.values = list(block.items) dtype = block.dtype.name - inferred_type = lib.infer_dtype(block.values.ravel()) + rvalues = block.values.ravel() + inferred_type = lib.infer_dtype(rvalues) if inferred_type == 'datetime64': self.set_atom_datetime64(block) elif inferred_type == 'date': raise TypeError( "[date] is not implemented as a table column") + elif inferred_type == 'datetime': + if getattr(rvalues[0],'tzinfo',None) is not None: + raise TypeError( + "timezone support on datetimes is not yet implemented as a table column") + raise TypeError( + "[datetime] is not implemented as a table column") elif inferred_type == 'unicode': raise TypeError( "[unicode] is not implemented as a table column") @@ -2080,8 +2087,18 @@ def validate(self, other): (other.table_type, self.table_type)) for c in ['index_axes','non_index_axes','values_axes']: - if getattr(self,c,None) != getattr(other,c,None): - raise ValueError("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,getattr(self,c,None),getattr(other,c,None))) + sv = getattr(self,c,None) + ov = getattr(other,c,None) + if sv != ov: + + # show the error for the specific axes + for i, sax in enumerate(sv): + oax = ov[i] + if sax != oax: + raise ValueError("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sax,oax)) + + # should never get here + raise Exception("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sv,ov)) @property def nrows_expected(self): @@ -2181,7 +2198,7 @@ def validate_min_itemsize(self, min_itemsize): if k == 'values': continue if k not in q: - raise ValueError("min_itemsize has [%s] which is not an axis or data_column" % k) + raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k) @property def indexables(self): @@ -2293,6 +2310,30 @@ def get_object(self, obj): """ return the data for this obj """ return obj + def validate_data_columns(self, data_columns, min_itemsize): + """ take the input data_columns and 
min_itemize and create a data_columns spec """ + + if not len(self.non_index_axes): + return [] + + axis_labels = self.non_index_axes[0][1] + + # evaluate the passed data_columns, True == use all columns + # take only valide axis labels + if data_columns is True: + data_columns = axis_labels + elif data_columns is None: + data_columns = [] + + # if min_itemsize is a dict, add the keys (exclude 'values') + if isinstance(min_itemsize,dict): + + existing_data_columns = set(data_columns) + data_columns.extend([ k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns ]) + + # return valid columns in the order of our axis + return [c for c in data_columns if c in axis_labels] + def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2380,26 +2421,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, for a in self.non_index_axes: obj = obj.reindex_axis(a[1], axis=a[0], copy=False) - # get out blocks + # figure out data_columns and get out blocks block_obj = self.get_object(obj) - blocks = None - - if data_columns is not None and len(self.non_index_axes): - axis = self.non_index_axes[0][0] - axis_labels = self.non_index_axes[0][1] - if data_columns is True: - data_columns = axis_labels - - data_columns = [c for c in data_columns if c in axis_labels] + blocks = block_obj._data.blocks + if len(self.non_index_axes): + axis, axis_labels = self.non_index_axes[0] + data_columns = self.validate_data_columns(data_columns, min_itemsize) if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( - data_columns), axis=axis, copy=False)._data.blocks + data_columns), axis=axis, copy=False)._data.blocks for c in data_columns: blocks.extend(block_obj.reindex_axis( - [c], axis=axis, copy=False)._data.blocks) - - if blocks is None: - blocks = block_obj._data.blocks + [c], axis=axis, copy=False)._data.blocks) # add my values self.values_axes = [] diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index d57b7f41b62fc..aa3fce3959860 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -456,7 +456,9 @@ def test_malformed(self): 2,3,4 """ try: - it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, skiprows=[2]) + it = self.read_table(StringIO(data), sep=',', header=1, + comment='#', iterator=True, chunksize=1, + skiprows=[2]) df = it.read(1) it.read(2) self.assert_(False) @@ -876,6 +878,17 @@ def test_read_chunksize_named(self): tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) + def test_get_chunk_passed_chunksize(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + result = self.read_csv(StringIO(data), chunksize=2) + + piece = result.get_chunk() + self.assertEqual(len(piece), 2) + def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6acf17b1220a7..75fe0eefe771e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -694,25 +694,41 @@ def check_col(key,name,size): with ensure_clean(self.path) as store: - # infer the .typ on subsequent appends + def check_col(key,name,size): + 
self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size) + df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) + + # a min_itemsize that creates a data_column + store.remove('df') + store.append('df', df, min_itemsize={'A' : 200 }) + check_col('df', 'A', 200) + self.assert_(store.get_storer('df').data_columns == ['A']) + + # a min_itemsize that creates a data_column2 + store.remove('df') + store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + check_col('df', 'A', 200) + self.assert_(store.get_storer('df').data_columns == ['B','A']) + + # a min_itemsize that creates a data_column2 + store.remove('df') + store.append('df', df, data_columns = ['B'], min_itemsize={'values' : 200 }) + check_col('df', 'B', 200) + check_col('df', 'values_block_0', 200) + self.assert_(store.get_storer('df').data_columns == ['B']) + + # infer the .typ on subsequent appends store.remove('df') store.append('df', df[:5], min_itemsize=200) store.append('df', df[5:], min_itemsize=200) tm.assert_frame_equal(store['df'], df) # invalid min_itemsize keys - df = DataFrame(['foo','foo','foo','barh','barh','barh'],columns=['A']) - store.remove('df') self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20}) - # invalid sizes - store.remove('df') - store.append('df', df[:3], min_itemsize=3) - self.assertRaises(ValueError, store.append, 'df', df[3:]) - def test_append_with_data_columns(self): with ensure_clean(self.path) as store: @@ -1134,15 +1150,19 @@ def test_table_values_dtypes_roundtrip(self): df1['float322'] = 1. df1['float322'] = df1['float322'].astype('float32') df1['bool'] = df1['float32'] > 0 + df1['time1'] = Timestamp('20130101') + df1['time2'] = Timestamp('20130102') store.append('df_mixed_dtypes1', df1) result = store.select('df_mixed_dtypes1').get_dtype_counts() expected = Series({ 'float32' : 2, 'float64' : 1,'int32' : 1, 'bool' : 1, - 'int16' : 1, 'int8' : 1, 'int64' : 1, 'object' : 1 }) + 'int16' : 1, 'int8' : 1, 'int64' : 1, 'object' : 1, + 'datetime64[ns]' : 2}) result.sort() expected.sort() tm.assert_series_equal(result,expected) + def test_table_mixed_dtypes(self): # frame @@ -1215,6 +1235,17 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... 
self.assertRaises(TypeError, store.append, 'df_unimplemented', df) + def test_table_append_with_timezones(self): + # not implemented yet + + with ensure_clean(self.path) as store: + + # check with mixed dtypes + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern')),index=range(5)) + + # timezones not yet supported + self.assertRaises(TypeError, store.append, 'df_tz', df) + def test_remove(self): with ensure_clean(self.path) as store: diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 383b98bfc440d..82719817b5744 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -46,8 +46,14 @@ def axes(self): @property def blocks(self): """ return our series in the column order """ - s = self.sp_frame._series - return [ self.iget(i) for i in self.sp_frame.columns ] + return [ self.iget(i) for i, c in enumerate(self.sp_frame.columns) ] + + def get_numeric_data(self): + # does not check, but assuming all numeric for now + return self.sp_frame + + def get_bool_data(self): + raise NotImplementedError class SparseDataFrame(DataFrame): """ @@ -125,10 +131,13 @@ def convert_objects(self, convert_dates=True): @property def _constructor(self): - def wrapper(data, index=None, columns=None): - return SparseDataFrame(data, index=index, columns=columns, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) + def wrapper(data, index=None, columns=None, copy=False): + sf = SparseDataFrame(data, index=index, columns=columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + if copy: + sf = sf.copy() + return sf return wrapper def _init_dict(self, data, index, columns, dtype=None): diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index b799188170e6f..8374c4ab9c373 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -74,6 +74,23 @@ def _sparse_series_op(left, right, op, name): class SparseSeries(SparseArray, Series): + """Data structure for labeled, sparse floating point data + + Parameters + ---------- + data : {array-like, Series, SparseSeries, dict} + kind : {'block', 'integer'} + fill_value : float + Defaults to NaN (code for missing) + sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + + Notes + ----- + SparseSeries objects are immutable via the typical Python means. If you + must change values, convert to dense, make your changes, then convert back + to sparse + """ __array_priority__ = 15 sp_index = None @@ -168,23 +185,6 @@ def from_array(cls, arr, index=None, name=None, copy=False, fill_value=None): def __init__(self, data, index=None, sparse_index=None, kind='block', fill_value=None, name=None, copy=False): - """Data structure for labeled, sparse floating point data - -Parameters ----------- -data : {array-like, Series, SparseSeries, dict} -kind : {'block', 'integer'} -fill_value : float - Defaults to NaN (code for missing) -sparse_index : {BlockIndex, IntIndex}, optional - Only if you have one. Mainly used internally - -Notes ------ -SparseSeries objects are immutable via the typical Python means. 
If you -must change values, convert to dense, make your changes, then convert back -to sparse - """ pass @property @@ -572,4 +572,23 @@ def combine_first(self, other): class SparseTimeSeries(SparseSeries, TimeSeries): + """Data structure for labeled, sparse floating point data, with `TimeStamp` + index labels + + Parameters + ---------- + data : {array-like, Series, SparseSeries, dict} + kind : {'block', 'integer'} + fill_value : float + Defaults to NaN (code for missing) + sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + + Notes + ----- + SparseSeries objects are immutable via the typical Python means. If you + must change values, convert to dense, make your changes, then convert back + to sparse + """ + pass diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index ebdb8e178d03b..17a45409c1ab5 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -100,6 +100,8 @@ def testWLS(self): if sm.version.version < '0.5.0': raise nose.SkipTest + print( "Make sure you're using statsmodels 0.5.0.dev-cec4f26 or later.") + X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) Y = Series(np.random.randn(30)) weights = X.std(1) diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index 0ae00f43c7f19..c1231df026853 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -39,6 +39,13 @@ def test_api(self): self.assertTrue(hasattr(pd, 'reset_option')) self.assertTrue(hasattr(pd, 'describe_option')) + def test_is_one_of_factory(self): + v = self.cf.is_one_of_factory([None,12]) + + v(12) + v(None) + self.assertRaises(ValueError,v,1.1) + def test_register_option(self): self.cf.register_option('a', 1, 'doc') diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index adbbed817ac52..e7c5d0201ca1d 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -137,14 +137,13 @@ def test_repr_obeys_max_seq_limit(self): self.assertTrue(len(com.pprint_thing(range(1000)))< 100) def test_repr_should_return_str(self): - """ - http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ - http://docs.python.org/reference/datamodel.html#object.__repr__ - "...The return value must be a string object." + # http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ + # http://docs.python.org/reference/datamodel.html#object.__repr__ + # "...The return value must be a string object." 
+ + # (str on py2.x, str (unicode) on py3) - (str on py2.x, str (unicode) on py3) - """ data = [8, 5, 3, 5] index1 = [u"\u03c3", u"\u03c4", u"\u03c5", u"\u03c6"] cols = [u"\u03c8"] @@ -162,7 +161,7 @@ def test_expand_frame_repr(self): df_tall = DataFrame('hello', range(30), range(5)) with option_context('mode.sim_interactive', True): - with option_context('display.width', 50, + with option_context('display.width', 50, 'display.height', 20): with option_context('display.expand_frame_repr', True): self.assertFalse(has_info_repr(df_small)) @@ -180,6 +179,18 @@ def test_expand_frame_repr(self): self.assertTrue(has_info_repr(df_tall)) self.assertFalse(has_expanded_repr(df_tall)) + def test_repr_non_interactive(self): + # in non interactive mode, there can be no dependency on the + # result of terminal auto size detection + df = DataFrame('hello', range(1000), range(5)) + + with option_context('mode.sim_interactive', False, + 'display.width', 0, + 'display.height', 0, + 'display.max_rows',5000): + self.assertFalse(has_info_repr(df)) + self.assertFalse(has_expanded_repr(df)) + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: @@ -636,7 +647,7 @@ def test_wide_repr(self): wide_repr = repr(df) self.assert_(rep_str != wide_repr) - with option_context('display.line_width', 120): + with option_context('display.width', 120): wider_repr = repr(df) self.assert_(len(wider_repr) < len(wide_repr)) @@ -661,7 +672,7 @@ def test_wide_repr_named(self): wide_repr = repr(df) self.assert_(rep_str != wide_repr) - with option_context('display.line_width', 120): + with option_context('display.width', 150): wider_repr = repr(df) self.assert_(len(wider_repr) < len(wide_repr)) @@ -684,7 +695,7 @@ def test_wide_repr_multiindex(self): wide_repr = repr(df) self.assert_(rep_str != wide_repr) - with option_context('display.line_width', 120): + with option_context('display.width', 150): wider_repr = repr(df) self.assert_(len(wider_repr) < len(wide_repr)) @@ -709,10 +720,9 @@ def test_wide_repr_multiindex_cols(self): wide_repr = repr(df) self.assert_(rep_str != wide_repr) - with option_context('display.line_width', 120): + with option_context('display.width', 150): wider_repr = repr(df) self.assert_(len(wider_repr) < len(wide_repr)) - self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1) reset_option('display.expand_frame_repr') @@ -726,7 +736,7 @@ def test_wide_repr_unicode(self): wide_repr = repr(df) self.assert_(rep_str != wide_repr) - with option_context('display.line_width', 120): + with option_context('display.width', 150): wider_repr = repr(df) self.assert_(len(wider_repr) < len(wide_repr)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 139a7cace83a7..4604678d58d5a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1491,6 +1491,30 @@ def f(group): for key, group in grouped: assert_frame_equal(result.ix[key], f(group)) + def test_mutate_groups(self): + + # GH3380 + + mydf = DataFrame({ + 'cat1' : ['a'] * 8 + ['b'] * 6, + 'cat2' : ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + ['d'] * 2 + ['e'] * 2, + 'cat3' : map(lambda x: 'g%s' % x, range(1,15)), + 'val' : np.random.randint(100, size=14), + }) + + def f_copy(x): + x = x.copy() + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + def f_no_copy(x): + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + grpby_copy = mydf.groupby('cat1').apply(f_copy) + 
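+        # GH3380: functions handed to groupby(...).apply() may mutate the group they
+        # receive. f_copy ranks on an explicit copy, while f_no_copy adds the 'rank'
+        # column in place; the assertion below checks that both strategies produce
+        # the same result, i.e. mutating the group inside apply() does not change
+        # the outcome.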
grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy) + assert_series_equal(grpby_copy,grpby_no_copy) + def test_apply_chunk_view(self): # Low level tinkering could be unsafe, make sure not df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index c3d6faf6e71b7..bc717a0fbf6d1 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -724,6 +724,11 @@ def test_xs_multiindex(self): expected = df.iloc[:,0:2].loc[:,'a'] assert_frame_equal(result,expected) + result = df.xs('foo', level='lvl1', axis=1) + expected = df.iloc[:, 1:2].copy() + expected.columns = expected.columns.droplevel('lvl1') + assert_frame_equal(result, expected) + def test_setitem_dtype_upcast(self): # GH3216 diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index d920df1ca867a..ef605abb8e4fb 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -99,10 +99,11 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', grouped = data.groupby(keys) agged = grouped.agg(aggfunc) - to_unstack = [agged.index.names[i] - for i in range(len(rows), len(keys))] - - table = agged.unstack(to_unstack) + table = agged + if table.index.nlevels > 1: + to_unstack = [agged.index.names[i] + for i in range(len(rows), len(keys))] + table = agged.unstack(to_unstack) if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): @@ -121,6 +122,9 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', if values_passed and not values_multi: table = table[values[0]] + if len(rows) == 0 and len(cols) > 0: + table = table.T + return table diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index e9383e26f148a..c0e0de1a23dad 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -50,6 +50,19 @@ def test_pivot_table(self): expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) + def test_pivot_table_nocols(self): + df = DataFrame({'rows': ['a', 'b', 'c'], + 'cols': ['x', 'y', 'z'], + 'values': [1,2,3]}) + rs = df.pivot_table(cols='cols', aggfunc=np.sum) + xp = df.pivot_table(rows='cols', aggfunc=np.sum).T + tm.assert_frame_equal(rs, xp) + + rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'}) + xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T + tm.assert_frame_equal(rs, xp) + + def test_pass_array(self): result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) expected = self.data.pivot_table('D', rows='A', cols='C') diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e8dad6c85b2ac..3b66eba31fca1 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -746,6 +746,7 @@ def infer_freq(index, warn=True): Parameters ---------- index : DatetimeIndex + warn : boolean, default True Returns ------- diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index bd95a62c3f2ed..3bc801bd38695 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1197,6 +1197,8 @@ def generate_range(start=None, end=None, periods=None, start : datetime (default None) end : datetime (default None) periods : int, optional + time_rule : (legacy) name of DateOffset object to be used, optional + Corresponds with names expected by tseries.frequencies.get_offset Note ---- @@ -1204,6 +1206,7 @@ def generate_range(start=None, end=None, periods=None, * At least two of (start, end, periods) must 
be specified. * If both start and end are specified, the returned dates will satisfy start <= date <= end. + * If both time_rule and offset are specified, time_rule supersedes offset. Returns ------- diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 51903b7179822..a405fda1c4fe4 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -13,6 +13,7 @@ import pandas.core.common as com from pandas.core.common import isnull +from pandas.util import py3compat from pandas.lib import Timestamp import pandas.lib as lib @@ -40,29 +41,28 @@ def f(self): class Period(object): + """ + Represents an period of time + Parameters + ---------- + value : Period or basestring, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + e.g., 'B' for businessday, ('T', 5) or '5T' for 5 minutes + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ __slots__ = ['freq', 'ordinal'] def __init__(self, value=None, freq=None, ordinal=None, year=None, month=1, quarter=None, day=1, hour=0, minute=0, second=0): - """ - Represents an period of time - - Parameters - ---------- - value : Period or basestring, default None - The time period represented (e.g., '4Q2005') - freq : str, default None - e.g., 'B' for businessday, ('T', 5) or '5T' for 5 minutes - year : int, default None - month : int, default 1 - quarter : int, default None - day : int, default 1 - hour : int, default 0 - minute : int, default 0 - second : int, default 0 - """ # freq points to a tuple (base, mult); base is one of the defined # periods such as A, Q, etc. Every five minutes would be, e.g., # ('T', 5) but may be passed in as a string like '5T' @@ -265,12 +265,49 @@ def __repr__(self): base, mult = _gfc(self.freq) formatted = tslib.period_format(self.ordinal, base) freqstr = _freq_mod._reverse_period_code_map[base] + + if not py3compat.PY3: + encoding = com.get_option("display.encoding") + formatted = formatted.encode(encoding) + return "Period('%s', '%s')" % (formatted, freqstr) def __str__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + encoding = com.get_option("display.encoding") + return self.__unicode__().encode(encoding, 'replace') + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. 
+ """ base, mult = _gfc(self.freq) formatted = tslib.period_format(self.ordinal, base) - return ("%s" % formatted) + value = (u"%s" % formatted) + assert type(value) == unicode + + return value + def strftime(self, fmt): """ diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index b20303efe222f..57f861aff8bfc 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -282,13 +282,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError - data = obj._data - - new_blocks = [b.take(indexer, axis=1) for b in data.blocks] - new_axes = list(data.axes) - new_axes[1] = new_index - new_data = BlockManager(new_blocks, new_axes) - return DataFrame(new_data) + return DataFrame(obj._data.take(indexer,new_index=new_index,axis=1)) else: raise NotImplementedError diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 436254a682e8c..f34a237b55dd4 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -206,8 +206,9 @@ def test_repr(self): def test_strftime(self): p = Period('2000-1-1 12:34:12', freq='S') - self.assert_(p.strftime('%Y-%m-%d %H:%M:%S') == - '2000-01-01 12:34:12') + res = p.strftime('%Y-%m-%d %H:%M:%S') + self.assert_( res == '2000-01-01 12:34:12') + self.assert_( isinstance(res,unicode)) # GH3363 def test_sub_delta(self): left, right = Period('2011', freq='A'), Period('2007', freq='A') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 080146c3eb36d..4d15ec8c8ace9 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -119,6 +119,11 @@ def _is_fixed_offset(tz): # Python front end to C extension type _Timestamp # This serves as the box for datetime64 class Timestamp(_Timestamp): + """TimeStamp is the pandas equivalent of python's Datetime + and is interchangable with it in most cases. It's the type used + for the entries that make up a DatetimeIndex, and other timeseries + oriented data structures in pandas. + """ @classmethod def fromordinal(cls, ordinal, offset=None, tz=None): @@ -309,7 +314,7 @@ class Timestamp(_Timestamp): class NaTType(_NaT): - + """(N)ot-(A)-(T)ime, the time equivalent of NaN""" def __new__(cls): cdef _NaT base @@ -2278,6 +2283,7 @@ cdef list extra_fmts = [(b"%q", b"^`AB`^"), cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^"] cdef _period_strftime(int64_t value, int freq, object fmt): + import sys cdef: Py_ssize_t i date_info dinfo @@ -2320,6 +2326,10 @@ cdef _period_strftime(int64_t value, int freq, object fmt): if not PyString_Check(result): result = str(result) + # GH3363 + if sys.version_info[0] == 2: + result = result.decode('utf-8','strict') + return result # period accessors diff --git a/pandas/util/terminal.py b/pandas/util/terminal.py index 4d269e5086b3d..7b9ddfbcfc8e6 100644 --- a/pandas/util/terminal.py +++ b/pandas/util/terminal.py @@ -22,7 +22,7 @@ def get_terminal_size(): Detect terminal size and return tuple = (width, height). Only to be used when running in a terminal. 
Note that the IPython notebook, - IPython qtconsole, or IDLE do not run in a terminal, + IPython zmq frontends, or IDLE do not run in a terminal, """ import platform current_os = platform.system() diff --git a/scripts/use_build_cache.py b/scripts/use_build_cache.py new file mode 100755 index 0000000000000..361ac59e5e852 --- /dev/null +++ b/scripts/use_build_cache.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os + +""" +This script should be run from the repo root dir, it rewrites setup.py +to use the build cache directory specified in the envar BUILD_CACHE_DIR +or in a file named .build_cache_dir in the repo root directory. + +Artifacts included in the cache: +- gcc artifacts +- The .c files resulting from cythonizing pyx/d files +- 2to3 refactoring results (when run under python3) + +Tested on releases back to 0.7.0. + +""" + +try: + import argparse + argparser = argparse.ArgumentParser(description=""" + 'Program description. + """.strip()) + + argparser.add_argument('-f', '--force-overwrite', + default=False, + help='Setting this will overwrite any existing cache results for the current commit', + action='store_true') + argparser.add_argument('-d', '--debug', + default=False, + help='Report cache hits/misses', + action='store_true') + + args = argparser.parse_args() +except: + class Foo(object): + debug=False + force_overwrite=False + + args = Foo() # for 2.6, no argparse + +#print args.accumulate(args.integers) + +shim=""" +import os +import sys +import shutil +import warnings +import re +""" + +shim += ("BC_FORCE_OVERWRITE = %s\n" % args.force_overwrite) +shim += ("BC_DEBUG = %s\n" % args.debug) + +shim += """ +try: + if not ("develop" in sys.argv) and not ("install" in sys.argv): + 1/0 + basedir = os.path.dirname(__file__) + dotfile = os.path.join(basedir,".build_cache_dir") + BUILD_CACHE_DIR = "" + if os.path.exists(dotfile): + BUILD_CACHE_DIR = open(dotfile).readline().strip() + BUILD_CACHE_DIR = os.environ.get('BUILD_CACHE_DIR',BUILD_CACHE_DIR) + + if os.path.isdir(BUILD_CACHE_DIR): + print("--------------------------------------------------------") + print("BUILD CACHE ACTIVATED (V2). 
be careful, this is experimental.") + print("BUILD_CACHE_DIR: " + BUILD_CACHE_DIR ) + print("--------------------------------------------------------") + else: + BUILD_CACHE_DIR = None + + # retrieve 2to3 artifacts + if sys.version_info[0] >= 3: + from lib2to3 import refactor + from hashlib import sha1 + import shutil + import multiprocessing + pyver = "%d.%d" % (sys.version_info[:2]) + fileq = ["pandas"] + to_process = dict() + + # retrieve the hashes existing in the cache + orig_hashes=dict() + post_hashes=dict() + for path,dirs,files in os.walk(os.path.join(BUILD_CACHE_DIR,'pandas')): + for f in files: + s=f.split(".py-")[-1] + try: + prev_h,post_h,ver = s.split('-') + if ver == pyver: + orig_hashes[prev_h] = os.path.join(path,f) + post_hashes[post_h] = os.path.join(path,f) + except: + pass + + while fileq: + f = fileq.pop() + + if os.path.isdir(f): + fileq.extend([os.path.join(f,x) for x in os.listdir(f)]) + else: + if not f.endswith(".py"): + continue + else: + try: + h = sha1(open(f,"rb").read()).hexdigest() + except IOError: + to_process[h] = f + else: + if h in orig_hashes and not BC_FORCE_OVERWRITE: + src = orig_hashes[h] + if BC_DEBUG: + print("2to3 cache hit %s,%s" % (f,h)) + shutil.copyfile(src,f) + elif h not in post_hashes: + # we're not in a dev dir with already processed files + if BC_DEBUG: + print("2to3 cache miss (will process) %s,%s" % (f,h)) + to_process[h] = f + + avail_fixes = set(refactor.get_fixers_from_package("lib2to3.fixes")) + avail_fixes.discard('lib2to3.fixes.fix_next') + t=refactor.RefactoringTool(avail_fixes) + if to_process: + print("Starting 2to3 refactoring...") + for orig_h,f in to_process.items(): + if BC_DEBUG: + print("2to3 on %s" % f) + try: + t.refactor([f],True) + post_h = sha1(open(f, "rb").read()).hexdigest() + cached_fname = f + '-' + orig_h + '-' + post_h + '-' + pyver + path = os.path.join(BUILD_CACHE_DIR, cached_fname) + pathdir =os.path.dirname(path) + if BC_DEBUG: + print("cache put %s in %s" % (f, path)) + try: + os.makedirs(pathdir) + except OSError as exc: + import errno + if exc.errno == errno.EEXIST and os.path.isdir(pathdir): + pass + else: + raise + + shutil.copyfile(f, path) + + except Exception as e: + print("While processing %s 2to3 raised: %s" % (f,str(e))) + + pass + print("2to3 done refactoring.") + +except Exception as e: + if not isinstance(e,ZeroDivisionError): + print( "Exception: " + str(e)) + BUILD_CACHE_DIR = None + +class CompilationCacheMixin(object): + def __init__(self, *args, **kwds): + cache_dir = kwds.pop("cache_dir", BUILD_CACHE_DIR) + self.cache_dir = cache_dir + if not os.path.isdir(cache_dir): + raise Exception("Error: path to Cache directory (%s) is not a dir" % cache_dir) + + def _copy_from_cache(self, hash, target): + src = os.path.join(self.cache_dir, hash) + if os.path.exists(src) and not BC_FORCE_OVERWRITE: + if BC_DEBUG: + print("Cache HIT: asked to copy file %s in %s" % + (src,os.path.abspath(target))) + s = "." + for d in target.split(os.path.sep)[:-1]: + s = os.path.join(s, d) + if not os.path.exists(s): + os.mkdir(s) + shutil.copyfile(src, target) + + return True + + return False + + def _put_to_cache(self, hash, src): + target = os.path.join(self.cache_dir, hash) + if BC_DEBUG: + print( "Cache miss: asked to copy file from %s to %s" % (src,target)) + s = "." 
+ for d in target.split(os.path.sep)[:-1]: + s = os.path.join(s, d) + if not os.path.exists(s): + os.mkdir(s) + shutil.copyfile(src, target) + + def _hash_obj(self, obj): + try: + return hash(obj) + except: + raise NotImplementedError("You must override this method") + +class CompilationCacheExtMixin(CompilationCacheMixin): + def _hash_file(self, fname): + from hashlib import sha1 + f= None + try: + hash = sha1() + hash.update(self.build_lib.encode('utf-8')) + try: + if sys.version_info[0] >= 3: + import io + f = io.open(fname, "rb") + else: + f = open(fname) + + first_line = f.readline() + # ignore cython generation timestamp header + if "Generated by Cython" not in first_line.decode('utf-8'): + hash.update(first_line) + hash.update(f.read()) + return hash.hexdigest() + + except: + raise + return None + finally: + if f: + f.close() + + except IOError: + return None + + def _hash_obj(self, ext): + from hashlib import sha1 + + sources = ext.sources + if (sources is None or + (not hasattr(sources, '__iter__')) or + isinstance(sources, str) or + sys.version[0] == 2 and isinstance(sources, unicode)): # argh + return False + + sources = list(sources) + ext.depends + hash = sha1() + try: + for fname in sources: + fhash = self._hash_file(fname) + if fhash: + hash.update(fhash.encode('utf-8')) + except: + return None + + return hash.hexdigest() + + +class CachingBuildExt(build_ext, CompilationCacheExtMixin): + def __init__(self, *args, **kwds): + CompilationCacheExtMixin.__init__(self, *args, **kwds) + kwds.pop("cache_dir", None) + build_ext.__init__(self, *args, **kwds) + + def build_extension(self, ext, *args, **kwds): + ext_path = self.get_ext_fullpath(ext.name) + build_path = os.path.join(self.build_lib, os.path.basename(ext_path)) + + hash = self._hash_obj(ext) + if hash and self._copy_from_cache(hash, ext_path): + return + + build_ext.build_extension(self, ext, *args, **kwds) + + hash = self._hash_obj(ext) + if os.path.exists(build_path): + self._put_to_cache(hash, build_path) # build_ext + if os.path.exists(ext_path): + self._put_to_cache(hash, ext_path) # develop + + def cython_sources(self, sources, extension): + import re + cplus = self.cython_cplus or getattr(extension, 'cython_cplus', 0) or \ + (extension.language and extension.language.lower() == 'c++') + target_ext = '.c' + if cplus: + target_ext = '.cpp' + + for i, s in enumerate(sources): + if not re.search("\.(pyx|pxi|pxd)$", s): + continue + ext_dir = os.path.dirname(s) + ext_basename = re.sub("\.[^\.]+$", "", os.path.basename(s)) + ext_basename += target_ext + target = os.path.join(ext_dir, ext_basename) + hash = self._hash_file(s) + sources[i] = target + if hash and self._copy_from_cache(hash, target): + continue + build_ext.cython_sources(self, [s], extension) + self._put_to_cache(hash, target) + + sources = [x for x in sources if x.startswith("pandas") or "lib." 
in x] + + return sources + +if BUILD_CACHE_DIR: # use the cache + cmdclass['build_ext'] = CachingBuildExt + +try: + # recent + setuptools_kwargs['use_2to3'] = True if BUILD_CACHE_DIR is None else False +except: + pass + +try: + # pre eb2234231 , ~ 0.7.0, + setuptools_args['use_2to3'] = True if BUILD_CACHE_DIR is None else False +except: + pass + +""" +def main(): + opd = os.path.dirname + opj = os.path.join + s= None + with open(opj(opd(__file__),"..","setup.py")) as f: + s = f.read() + if s: + if "BUILD CACHE ACTIVATED (V2)" in s: + print( "setup.py already wired with V2 build_cache, skipping..") + else: + SEP="\nsetup(" + before,after = s.split(SEP) + with open(opj(opd(__file__),"..","setup.py"),"wb") as f: + f.write((before + shim + SEP + after).encode('ascii')) + print(""" + setup.py was rewritten to use a build cache. + Make sure you've put the following in your .bashrc: + + export BUILD_CACHE_DIR= + echo $BUILD_CACHE_DIR > pandas_repo_rootdir/.build_cache_dir + + Once active, build results (compilation, cythonizations and 2to3 artifacts) + will be cached in "$BUILD_CACHE_DIR" and subsequent builds should be + sped up if no changes requiring recompilation were made. + + Go ahead and run: + + python setup.py clean + python setup.py develop + + """) + +if __name__ == '__main__': + import sys + sys.exit(main()) diff --git a/setup.py b/setup.py index d65e303758ee8..4ac36ccf425d8 100755 --- a/setup.py +++ b/setup.py @@ -11,23 +11,6 @@ import shutil import warnings -try: - basedir = os.path.dirname(__file__) - dotfile = os.path.join(basedir,".build_cache_dir") - BUILD_CACHE_DIR = "" - if os.path.exists(dotfile): - BUILD_CACHE_DIR = open(dotfile).readline().strip() - BUILD_CACHE_DIR = os.environ.get('BUILD_CACHE_DIR',BUILD_CACHE_DIR) - - if os.path.isdir(BUILD_CACHE_DIR): - print("--------------------------------------------------------") - print("BUILD CACHE ACTIVATED. be careful, this is experimental.") - print("--------------------------------------------------------") - else: - BUILD_CACHE_DIR = None -except: - BUILD_CACHE_DIR = None - # may need to work around setuptools bug by providing a fake Pyrex try: import Cython @@ -205,7 +188,7 @@ def build_extensions(self): MICRO = 0 ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) -QUALIFIER = 'rc1' +QUALIFIER = '' FULLVERSION = VERSION if not ISRELEASED: @@ -346,155 +329,6 @@ def build_extensions(self): build_ext.build_extensions(self) -class CompilationCacheMixin(object): - def __init__(self, *args, **kwds): - cache_dir = kwds.pop("cache_dir", BUILD_CACHE_DIR) - self.cache_dir = cache_dir - if not os.path.isdir(cache_dir): - raise Exception("Error: path to Cache directory (%s) is not a dir" % cache_dir) - - def _copy_from_cache(self, hash, target): - src = os.path.join(self.cache_dir, hash) - if os.path.exists(src): - # print("Cache HIT: asked to copy file %s in %s" % - # (src,os.path.abspath(target))) - s = "." - for d in target.split(os.path.sep)[:-1]: - s = os.path.join(s, d) - if not os.path.exists(s): - os.mkdir(s) - shutil.copyfile(src, target) - - return True - - return False - - def _put_to_cache(self, hash, src): - target = os.path.join(self.cache_dir, hash) - # print( "Cache miss: asked to copy file from %s to %s" % (src,target)) - s = "." 
- for d in target.split(os.path.sep)[:-1]: - s = os.path.join(s, d) - if not os.path.exists(s): - os.mkdir(s) - shutil.copyfile(src, target) - - def _hash_obj(self, obj): - """ - you should override this method to provide a sensible - implementation of hashing functions for your intended objects - """ - try: - return hash(obj) - except: - raise NotImplementedError("You must override this method") - -class CompilationCacheExtMixin(CompilationCacheMixin): - def __init__(self, *args, **kwds): - CompilationCacheMixin.__init__(self, *args, **kwds) - - def _hash_file(self, fname): - from hashlib import sha1 - f= None - try: - hash = sha1() - hash.update(self.build_lib.encode('utf-8')) - try: - if sys.version_info[0] >= 3: - import io - f = io.open(fname, "rb") - else: - f = open(fname) - - first_line = f.readline() - # ignore cython generation timestamp header - if "Generated by Cython" not in first_line.decode('utf-8'): - hash.update(first_line) - hash.update(f.read()) - return hash.hexdigest() - - except: - raise - return None - finally: - if f: - f.close() - - except IOError: - return None - - def _hash_obj(self, ext): - from hashlib import sha1 - - sources = ext.sources - if (sources is None or - (not hasattr(sources, '__iter__')) or - isinstance(sources, str) or - sys.version[0] == 2 and isinstance(sources, unicode)): # argh - return False - - sources = list(sources) + ext.depends - hash = sha1() - try: - for fname in sources: - fhash = self._hash_file(fname) - if fhash: - hash.update(fhash.encode('utf-8')) - except: - return None - - return hash.hexdigest() - - -class CachingBuildExt(build_ext, CompilationCacheExtMixin): - def __init__(self, *args, **kwds): - CompilationCacheExtMixin.__init__(self, *args, **kwds) - kwds.pop("cache_dir", None) - build_ext.__init__(self, *args, **kwds) - - def build_extension(self, ext, *args, **kwds): - ext_path = self.get_ext_fullpath(ext.name) - build_path = os.path.join(self.build_lib, os.path.basename(ext_path)) - - hash = self._hash_obj(ext) - if hash and self._copy_from_cache(hash, ext_path): - return - - build_ext.build_extension(self, ext, *args, **kwds) - - hash = self._hash_obj(ext) - if os.path.exists(build_path): - self._put_to_cache(hash, build_path) # build_ext - if os.path.exists(ext_path): - self._put_to_cache(hash, ext_path) # develop - - def cython_sources(self, sources, extension): - import re - cplus = self.cython_cplus or getattr(extension, 'cython_cplus', 0) or \ - (extension.language and extension.language.lower() == 'c++') - target_ext = '.c' - if cplus: - target_ext = '.cpp' - - for i, s in enumerate(sources): - if not re.search("\.(pyx|pxi|pxd)$", s): - continue - ext_dir = os.path.dirname(s) - ext_basename = re.sub("\.[^\.]+$", "", os.path.basename(s)) - ext_basename += target_ext - target = os.path.join(ext_dir, ext_basename) - hash = self._hash_file(s) - sources[i] = target - if hash and self._copy_from_cache(hash, target): - continue - build_ext.cython_sources(self, [s], extension) - self._put_to_cache(hash, target) - - sources = [x for x in sources if x.startswith("pandas")] - - return sources - - class CythonCommand(build_ext): """Custom distutils command subclassed from Cython.Distutils.build_ext to compile pyx->c, and stop there. 
All this does is override the @@ -524,8 +358,6 @@ def run(self): if cython: suffix = '.pyx' cmdclass['build_ext'] = CheckingBuildExt - if BUILD_CACHE_DIR: # use the cache - cmdclass['build_ext'] = CachingBuildExt cmdclass['cython'] = CythonCommand else: suffix = '.c' @@ -645,6 +477,10 @@ def pxd(name): setuptools_kwargs["test_suite"] = "nose.collector" write_version_py() + +# The build cache system does string matching below this point. +# if you change something, be careful. + setup(name=DISTNAME, version=FULLVERSION, maintainer=AUTHOR, diff --git a/tox.sh b/tox.sh new file mode 100755 index 0000000000000..b68ffc7fdb91c --- /dev/null +++ b/tox.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + + +if [ x"$1" == x"fast" ]; then + scripts/use_build_cache.py +fi; + +tox diff --git a/tox_prll.sh b/tox_prll.sh index 66311aaf1991e..910e49b6b5a80 100755 --- a/tox_prll.sh +++ b/tox_prll.sh @@ -12,6 +12,10 @@ ENVS=$(cat tox.ini | grep envlist | tr "," " " | cut -d " " -f 3-) TOX_INI_PAR="tox_prll.ini" +if [ x"$1" == x"fast" ]; then + scripts/use_build_cache.py +fi; + echo "[Creating distfile]" tox --sdistonly export DISTFILE="$(find .tox/dist -type f )" diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 19b29d87f40b5..7745450e5c03b 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -150,17 +150,30 @@ def f(K=500): ## setup = common_setup + """ -from pandas.core.config import option_context +df = pandas.DataFrame(np.random.randn(10,10000)) +""" -def interactive_repr(frame): - with option_context('mode.sim_interactive', True): - repr(frame) +frame_repr_wide = Benchmark('repr(df)', setup, + start_date=datetime(2012, 8, 1)) -df = pandas.DataFrame(np.random.randn(10,10000)) +## +setup = common_setup + """ +df = pandas.DataFrame(np.random.randn(10000, 10)) """ - -frame_wide_repr = Benchmark('repr(df)', setup, + +frame_repr_tall = Benchmark('repr(df)', setup, start_date=datetime(2012, 8, 1)) -frame_wide_repr_interactive = Benchmark('interactive_repr(df)', setup, - start_date=datetime(2012, 8, 1)) +## +setup = common_setup + """ +df = DataFrame(randn(100000, 1)) +""" + +frame_xs_row = Benchmark('df.xs(50000)', setup) + +## +setup = common_setup + """ +df = DataFrame(randn(1,100000)) +""" + +frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup) diff --git a/vb_suite/test_perf.py b/vb_suite/test_perf.py index 74d5f457296ee..d019af3370ba9 100755 --- a/vb_suite/test_perf.py +++ b/vb_suite/test_perf.py @@ -108,6 +108,7 @@ type=int, help='set processor affinity of processm by default bind to cpu/core #1 only' 'requires the "affinity" python module , will raise Warning otherwise' ) + parser.add_argument('-u', '--burnin', metavar="u", dest='burnin', @@ -115,6 +116,16 @@ type=int, help='number of extra iteration per benchmark to perform first, then throw away. ' ) +parser.add_argument('-S', '--stats', + default=False, + action='store_true', + help='when specified with -N, prints s.describe() per vbench. ' ) + +parser.add_argument('-q', '--quiet', + default=False, + action='store_true', + help='suppress report output to stdout. 
' ) + def get_results_df(db, rev): """Takes a git commit hash and returns a Dataframe of benchmark results """ @@ -261,6 +272,7 @@ def profile_head_single(benchmark): results.append(d.get('timing',np.nan)) gc.enable() + gc.collect() finally: gc.enable() @@ -280,6 +292,7 @@ def profile_head(benchmarks): print( "Performing %d benchmarks (%d runs each)" % ( len(benchmarks), args.hrepeats)) ss= [profile_head_single(b) for b in benchmarks] + print("\n") results = DataFrame(ss) results.columns=[ "#%d" %i for i in range(args.hrepeats)] @@ -289,54 +302,65 @@ def profile_head(benchmarks): shas, messages, _,_ = _parse_commit_log(None,REPO_PATH,base_commit="HEAD^") print_report(results,h_head=shas[-1],h_msg=messages[-1]) + if args.outdf: prprint("The results DataFrame was written to '%s'\n" % args.outdf) DataFrame(results).save(args.outdf) def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""): - name_width=45 - col_width = 10 - - hdr = ("{:%s}" % name_width).format("Test name") - hdr += ("|{:^%d}" % col_width)* len(df.columns) - hdr += "|" - hdr = hdr.format(*df.columns) - hdr = "-"*len(hdr) + "\n" + hdr + "\n" + "-"*len(hdr) + "\n" - ftr=hdr - s = "\n" - s+= "Invoked with :\n" - s+= "--ncalls: %s\n" % (args.ncalls or 'Auto') - s+= "--repeats: %s\n" % (args.repeats) - s+= "\n\n" - - s += hdr - # import ipdb - # ipdb.set_trace() - for i in range(len(df)): - lfmt = ("{:%s}" % name_width) - lfmt += ("| {:%d.4f} " % (col_width-2))* len(df.columns) - lfmt += "|\n" - s += lfmt.format(df.index[i],*list(df.irow(i).values)) - - s+= ftr + "\n" - - s += "Ratio < 1.0 means the target commit is faster then the baseline.\n" - s += "Seed used: %d\n\n" % args.seed - - if h_head: - s += 'Target [%s] : %s\n' % (h_head, h_msg) - if h_baseline: - s += 'Base [%s] : %s\n\n' % ( - h_baseline, b_msg) - - logfile = open(args.log_file, 'w') - logfile.write(s) - logfile.close() - + name_width=45 + col_width = 10 + + hdr = ("{:%s}" % name_width).format("Test name") + hdr += ("|{:^%d}" % col_width)* len(df.columns) + hdr += "|" + hdr = hdr.format(*df.columns) + hdr = "-"*len(hdr) + "\n" + hdr + "\n" + "-"*len(hdr) + "\n" + ftr=hdr + s = "\n" + s+= "Invoked with :\n" + s+= "--ncalls: %s\n" % (args.ncalls or 'Auto') + s+= "--repeats: %s\n" % (args.repeats) + s+= "\n\n" + + s += hdr + # import ipdb + # ipdb.set_trace() + for i in range(len(df)): + lfmt = ("{:%s}" % name_width) + lfmt += ("| {:%d.4f} " % (col_width-2))* len(df.columns) + lfmt += "|\n" + s += lfmt.format(df.index[i],*list(df.irow(i).values)) + + s+= ftr + "\n" + + s += "Ratio < 1.0 means the target commit is faster then the baseline.\n" + s += "Seed used: %d\n\n" % args.seed + + if h_head: + s += 'Target [%s] : %s\n' % (h_head, h_msg) + if h_baseline: + s += 'Base [%s] : %s\n\n' % ( + h_baseline, b_msg) + + stats_footer = "\n" + if args.stats : + stats_footer += str(df.T.describe().T) + "\n\n" + + s+= stats_footer + logfile = open(args.log_file, 'w') + logfile.write(s) + logfile.close() + + if not args.quiet: prprint(s) - prprint("Results were also written to the logfile at '%s'" % - args.log_file) + + if args.stats and args.quiet: + prprint(stats_footer) + + prprint("Results were also written to the logfile at '%s'" % + args.log_file)
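The new -S/--stats switch appends a per-benchmark summary built with `df.T.describe().T`. The double transpose is needed because the results frame has benchmarks as rows and individual runs as columns, while describe() summarises column-wise. A small illustration with toy data, not real vbench output:

    import numpy as np
    from pandas import DataFrame

    # rows = benchmarks, columns = individual timing runs "#0", "#1", ...
    results = DataFrame(np.random.rand(3, 5) * 1000,
                        index=['bench_a', 'bench_b', 'bench_c'],
                        columns=['#%d' % i for i in range(5)])

    summary = results.T.describe().T          # one row of stats per benchmark
    print(summary[['mean', 'std', 'min', 'max']])

Combined with -q/--quiet, the full report goes only to the log file and just this summary is printed.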