Skip to content

Commit

Permalink
Added 'displayed_only' option to 'read_html' (pandas-dev#20047)
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd authored and jreback committed Mar 10, 2018
1 parent ed96567 commit bd31f71
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 5 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ Other Enhancements
- :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`)
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)

.. _whatsnew_0230.api_breaking:

Expand Down
71 changes: 66 additions & 5 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ class _HtmlFrameParser(object):
attrs : dict
List of HTML <table> element attributes to match.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Attributes
----------
io : str or file-like
Expand All @@ -172,6 +180,14 @@ class _HtmlFrameParser(object):
A dictionary of valid table attributes to use to search for table
elements.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Notes
-----
To subclass this class effectively you must override the following methods:
Expand All @@ -187,11 +203,12 @@ class _HtmlFrameParser(object):
functionality.
"""

def __init__(self, io, match, attrs, encoding):
def __init__(self, io, match, attrs, encoding, displayed_only):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
Expand Down Expand Up @@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table):
res = self._parse_tr(table)
return self._parse_raw_data(res)

def _handle_hidden_tables(self, tbl_list, attr_name):
"""Returns list of tables, potentially removing hidden elements
Parameters
----------
tbl_list : list of Tag or list of Element
Type of list elements will vary depending upon parser used
attr_name : str
Name of the accessor for retrieving HTML attributes
Returns
-------
list of Tag or list of Element
Return type matches `tbl_list`
"""
if not self.displayed_only:
return tbl_list

return [x for x in tbl_list if "display:none" not in
getattr(x, attr_name).get('style', '').replace(" ", "")]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
Expand Down Expand Up @@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs):

result = []
unique_tables = set()
tables = self._handle_hidden_tables(tables, "attrs")

for table in tables:
if self.displayed_only:
for elem in table.find_all(
style=re.compile(r"display:\s*none")):
elem.decompose()

if (table not in unique_tables and
table.find(text=match) is not None):
result.append(table)
Expand Down Expand Up @@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs):

tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

tables = self._handle_hidden_tables(tables, "attrib")
if self.displayed_only:
for table in tables:
# lxml utilizes XPATH 1.0 which does not have regex
# support. As a result, we find all elements with a style
# attribute and iterate them to check for display:none
for elem in table.xpath('.//*[@style]'):
if "display:none" in elem.attrib.get(
"style", "").replace(" ", ""):
elem.getparent().remove(elem)

if not tables:
raise ValueError("No tables found matching regex {patt!r}"
.format(patt=pattern))
Expand Down Expand Up @@ -729,15 +784,15 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, attrs, encoding, **kwargs):
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

# hack around python 3 deleting the exception variable
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding)
p = parser(io, compiled_match, attrs, encoding, displayed_only)

try:
tables = p.parse_tables()
Expand Down Expand Up @@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=None, thousands=',', encoding=None,
decimal='.', converters=None, na_values=None,
keep_default_na=True):
keep_default_na=True, displayed_only=True):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
Parameters
Expand Down Expand Up @@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
.. versionadded:: 0.19.0
displayed_only : bool, default True
Whether to parse only displayed elements; when True, elements styled with "display: none" are skipped
.. versionadded:: 0.23.0
Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na)
keep_default_na=keep_default_na,
displayed_only=displayed_only)
66 changes: 66 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,39 @@ def test_wikipedia_states_table(self):
result = self.read_html(data, 'Arizona', header=1)[0]
assert result['sq mi'].dtype == np.dtype('float64')

# Parametrized over both settings of ``displayed_only``:
#   True  -> first frame holds only "foo"; exp1 is None, meaning the table
#            styled "display: none" must not be returned at all.
#   False -> hidden span text is kept ("foo bar baz qux") and the hidden
#            table comes back as a second frame containing "foo".
@pytest.mark.parametrize("displayed_only,exp0,exp1", [
(True, DataFrame(["foo"]), None),
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
def test_displayed_only(self, displayed_only, exp0, exp1):
# GH 20027
# The fixture covers three spellings of the hidden style on nested spans
# ("display:none" followed by another declaration, bare "display:none",
# and "display: none" with a space) plus one entirely hidden table.
data = StringIO("""<html>
<body>
<table>
<tr>
<td>
foo
<span style="display:none;text-align:center">bar</span>
<span style="display:none">baz</span>
<span style="display: none">qux</span>
</td>
</tr>
</table>
<table style="display: none">
<tr>
<td>foo</td>
</tr>
</table>
</body>
</html>""")

dfs = self.read_html(data, displayed_only=displayed_only)
# The first (visible) table is always returned; its content depends on
# whether the hidden spans were stripped.
tm.assert_frame_equal(dfs[0], exp0)

if exp1 is not None:
# Hidden table was parsed: compare it against the expected second frame.
tm.assert_frame_equal(dfs[1], exp1)
else:
assert len(dfs) == 1 # Should not parse hidden table

def test_decimal_rows(self):

# GH 12907
Expand Down Expand Up @@ -896,6 +929,39 @@ def test_computer_sales_page(self):
data = os.path.join(DATA_PATH, 'computer_sales_page.html')
self.read_html(data, header=[0, 1])

# Parametrized over both settings of ``displayed_only``:
#   True  -> first frame holds only "foo"; exp1 is None, meaning the table
#            styled "display: none" must not be returned at all.
#   False -> hidden span text is kept ("foo bar baz qux") and the hidden
#            table comes back as a second frame containing "foo".
@pytest.mark.parametrize("displayed_only,exp0,exp1", [
(True, DataFrame(["foo"]), None),
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
def test_displayed_only(self, displayed_only, exp0, exp1):
# GH 20027
# The fixture covers three spellings of the hidden style on nested spans
# ("display:none" followed by another declaration, bare "display:none",
# and "display: none" with a space) plus one entirely hidden table.
data = StringIO("""<html>
<body>
<table>
<tr>
<td>
foo
<span style="display:none;text-align:center">bar</span>
<span style="display:none">baz</span>
<span style="display: none">qux</span>
</td>
</tr>
</table>
<table style="display: none">
<tr>
<td>foo</td>
</tr>
</table>
</body>
</html>""")

dfs = self.read_html(data, displayed_only=displayed_only)
# The first (visible) table is always returned; its content depends on
# whether the hidden spans were stripped.
tm.assert_frame_equal(dfs[0], exp0)

if exp1 is not None:
# Hidden table was parsed: compare it against the expected second frame.
tm.assert_frame_equal(dfs[1], exp1)
else:
assert len(dfs) == 1 # Should not parse hidden table


def test_invalid_flavor():
url = 'google.com'
Expand Down

0 comments on commit bd31f71

Please sign in to comment.