Skip to content

Commit

Permalink
Attribute support for html tags (#5782)
Browse files Browse the repository at this point in the history
# What does this PR do?

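Change the HTML separators from complete opening tags (e.g. `<div>`) to tag prefixes (e.g. `<div`), so that tags carrying attributes, such as `<div class="amazing">`, are also recognised as split points.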

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
  • Loading branch information
Xmaster6y authored Jun 6, 2023
1 parent 0cfaa76 commit 65111eb
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 26 deletions.
52 changes: 26 additions & 26 deletions langchain/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,33 +740,33 @@ def get_separators_for_language(language: Language) -> List[str]:
elif language == Language.HTML:
return [
# First, try to split along HTML tags
"<body>",
"<div>",
"<p>",
"<br>",
"<li>",
"<h1>",
"<h2>",
"<h3>",
"<h4>",
"<h5>",
"<h6>",
"<span>",
"<table>",
"<tr>",
"<td>",
"<th>",
"<ul>",
"<ol>",
"<header>",
"<footer>",
"<nav>",
"<body",
"<div",
"<p",
"<br",
"<li",
"<h1",
"<h2",
"<h3",
"<h4",
"<h5",
"<h6",
"<span",
"<table",
"<tr",
"<td",
"<th",
"<ul",
"<ol",
"<header",
"<footer",
"<nav",
# Head
"<head>",
"<style>",
"<script>",
"<meta>",
"<title>",
"<head",
"<style",
"<script",
"<meta",
"<title",
"",
]
else:
Expand Down
36 changes: 36 additions & 0 deletions tests/unit_tests/test_text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,3 +576,39 @@ def test_markdown_code_splitter() -> None:
"block",
"```",
]


def test_html_code_splitter() -> None:
    """Check that the HTML splitter breaks on tags that carry attributes.

    The sample contains ``<p id="1234">`` and ``<div class="amazing">``; each
    of them is expected to start a chunk of its own.
    """
    # NOTE(review): the expected chunks below contain a space after each
    # embedded "\n", while the sample lines as shown here start at column 0.
    # The whitespace was presumably collapsed when this page was captured --
    # confirm both literals against the repository before relying on them.
    html_sample = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>
<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""
    expected_chunks = [
        "<h1>Sample Document</h1>\n <h2>Section</h2>",
        '<p id="1234">Reference content.</p>',
        "<h2>Lists</h2>\n <ul>",
        "<li>Item 1</li>\n <li>Item 2</li>",
        "<li>Item 3</li>\n </ul>",
        "<h3>A block</h3>",
        '<div class="amazing">',
        "<p>Some text</p>",
        "<p>Some more text</p>\n </div>",
    ]

    # A small chunk size with no overlap forces a split at nearly every tag.
    html_splitter = RecursiveCharacterTextSplitter.from_language(
        Language.HTML, chunk_size=60, chunk_overlap=0
    )
    assert html_splitter.split_text(html_sample) == expected_chunks

0 comments on commit 65111eb

Please sign in to comment.