Skip to content

Commit

Permalink
fix(python): Replace spaces with `&nbsp;` to support showing multiple s…
Browse files Browse the repository at this point in the history
…paces in HTML repr (pola-rs#19783)
  • Loading branch information
TNieuwdorp authored Nov 15, 2024
1 parent e59626d commit 5515b2c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
4 changes: 3 additions & 1 deletion py-polars/polars/dataframe/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def write_body(self) -> None:
else:
series = self.df[:, c]
self.elements.append(
html.escape(series._s.get_fmt(r, str_len_limit))
html.escape(
series._s.get_fmt(r, str_len_limit)
).replace(" ", " ")
)

def write(self, inner: str) -> None:
Expand Down
29 changes: 29 additions & 0 deletions py-polars/tests/unit/dataframe/test_repr_html.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import polars as pl


Expand Down Expand Up @@ -77,3 +79,30 @@ def test_series_repr_html_max_rows_default() -> None:

expected_rows = 10
assert html.count("<td>") - 2 == expected_rows


def test_html_representation_multiple_spaces() -> None:
    """Check that spaces in string cells are emitted as ``&nbsp;`` in the HTML repr.

    Browsers collapse runs of ordinary spaces, so the HTML representation is
    expected to replace each space in a cell value with ``&nbsp;``. The test
    builds a single string column whose values contain interior, leading and
    trailing runs of spaces, renders it with ``_repr_html_`` and inspects
    every body cell.
    """
    values = ["multiple   spaces", "  trailing and leading   "]
    df = pl.DataFrame({"string_col": values})
    html_repr = df._repr_html_()

    # Isolate the <tbody> section first so that <td> tags outside the body
    # (e.g. any used in the header) are not inspected. re.S lets "." span
    # newlines in case the markup is emitted across several lines.
    tbody_match = re.search(r"<tbody>(.*?)</tbody>", html_repr, re.S)
    assert tbody_match is not None, f"No <tbody> section found in: {html_repr}"

    # Collect *every* cell in the body. A single pattern of the form
    # "<tbody>.*?<td>(.*?)</td>.*?</tbody>" consumes the whole body in one
    # match, so findall would only ever yield the first cell.
    cells = re.findall(r"<td>(.*?)</td>", tbody_match.group(1), re.S)

    # One column, so one cell per input value; this also guards against the
    # loop below passing vacuously when nothing was captured.
    assert len(cells) == len(
        values
    ), f"Expected {len(values)} body cells but found: {cells}"

    for cell_content in cells:
        # Check that there are no regular spaces in the content
        assert " " not in cell_content, f"Unexpected space in cell: {cell_content}"
        # Check that the content contains &nbsp; as required
        assert (
            "&nbsp;" in cell_content
        ), f"Expected &nbsp; in cell but found: {cell_content}"

0 comments on commit 5515b2c

Please sign in to comment.