Merge branch 'cpcloud_read_html'

* cpcloud_read_html: DOC: update RELEASE.rst ENH: add ability to read html tables directly into DataFrames
pandas-dev · May 3, 2013 · 6518c79 · 6518c79
2 parents a6fed22 + 702dbf8
commit 6518c79
Show file tree

Hide file tree

Showing 12 changed files with 7,179 additions and 9 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -30,7 +30,8 @@ pandas 0.11.1
 
 **New features**
 
-  -
+  - pd.read_html() can now parse HTML strings, files or URLs and return
+    DataFrames, courtesy of @cpcloud. (GH3477_)
 
 **Improvements to existing features**
 
@@ -88,6 +89,7 @@ pandas 0.11.1
 .. _GH3437: https://github.com/pydata/pandas/issues/3437
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
 .. _GH3457: https://github.com/pydata/pandas/issues/3457
+.. _GH3477: https://github.com/pydata/pandas/issues/3477
 .. _GH3461: https://github.com/pydata/pandas/issues/3461
 .. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3448: https://github.com/pydata/pandas/issues/3448

diff --git a/ci/install.sh b/ci/install.sh
@@ -75,6 +75,8 @@ if ( ! $VENV_FILE_AVAILABLE ); then
         pip install $PIP_ARGS xlrd>=0.9.0
         pip install $PIP_ARGS 'http://downloads.sourceforge.net/project/pytseries/scikits.timeseries/0.91.3/scikits.timeseries-0.91.3.tar.gz?r='
         pip install $PIP_ARGS patsy
+        pip install $PIP_ARGS lxml
+        pip install $PIP_ARGS beautifulsoup4
 
         # fool statsmodels into thinking pandas was already installed
         # so it won't refuse to install itself. We want it in the zipped venv

diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -50,6 +50,13 @@ File IO
    read_csv
    ExcelFile.parse
 
+.. currentmodule:: pandas.io.html
+
+.. autosummary::
+   :toctree: generated/
+
+   read_html
+
 HDFStore: PyTables (HDF5)
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. currentmodule:: pandas.io.pytables

diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -99,6 +99,12 @@ Optional Dependencies
   * `openpyxl <http://packages.python.org/openpyxl/>`__, `xlrd/xlwt <http://www.python-excel.org/>`__
      * openpyxl version 1.6.1 or higher
      * Needed for Excel I/O
+  * `lxml <http://lxml.de>`__, or `Beautiful Soup 4 <http://www.crummy.com/software/BeautifulSoup>`__: for reading HTML tables
+     * The main difference between lxml and Beautiful Soup 4 is speed (lxml
+       is faster); however, Beautiful Soup sometimes returns what you might
+       intuitively expect. Both backends are implemented, so try them both to
+       see which one you like. They should return very similar results.
+     * Note that lxml requires Cython to build successfully
 
 .. note::
 

diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -12,9 +12,12 @@ API changes
 
 Enhancements
 ~~~~~~~~~~~~
+  - pd.read_html() can now parse HTML strings, files or URLs and return
+    DataFrames, courtesy of @cpcloud. (GH3477_)
 
 See the `full release notes
 <https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
 on GitHub for a complete list.
 
 .. _GH2437: https://github.com/pydata/pandas/issues/2437
+.. _GH3477: https://github.com/pydata/pandas/issues/3477
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -33,6 +33,7 @@
                                read_fwf, to_clipboard, ExcelFile,
                                ExcelWriter)
 from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
+from pandas.io.html import read_html
 from pandas.util.testing import debug
 
 from pandas.tools.describe import value_range