You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownTextSplitter
# of course this is part of a larger markdown document, but this is the minimal string to reproduce
txt = "\n\n***\n\n"
doc = Document(page_content=txt)
markdown_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
splitted = markdown_splitter.split_documents([doc])
Traceback (most recent call last):
File "t.py", line 9, in <module>
splitted = markdown_splitter.split_documents([doc])
File "/home/richard/.local/lib/python3.8/site-packages/langchain/text_splitter.py", line 101, in split_documents
return self.create_documents(texts, metadatas=metadatas)
File "/home/richard/.local/lib/python3.8/site-packages/langchain/text_splitter.py", line 88, in create_documents
for chunk in self.split_text(text):
File "/home/richard/.local/lib/python3.8/site-packages/langchain/text_splitter.py", line 369, in split_text
return self._split_text(text, self._separators)
File "/home/richard/.local/lib/python3.8/site-packages/langchain/text_splitter.py", line 346, in _split_text
splits = _split_text(text, separator, self._keep_separator)
File "/home/richard/.local/lib/python3.8/site-packages/langchain/text_splitter.py", line 37, in _split_text
_splits = re.split(f"({separator})", text)
File "/usr/lib/python3.8/re.py", line 231, in split
return _compile(pattern, flags).split(string, maxsplit)
File "/usr/lib/python3.8/re.py", line 304, in _compile
p = sre_compile.compile(pattern, flags)
File "/usr/lib/python3.8/sre_compile.py", line 764, in compile
p = sre_parse.parse(p, flags)
File "/usr/lib/python3.8/sre_parse.py", line 948, in parse
p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
File "/usr/lib/python3.8/sre_parse.py", line 443, in _parse_sub
itemsappend(_parse(source, state, verbose, nested + 1,
File "/usr/lib/python3.8/sre_parse.py", line 834, in _parse
p = _parse_sub(source, state, sub_verbose, nested + 1)
File "/usr/lib/python3.8/sre_parse.py", line 443, in _parse_sub
itemsappend(_parse(source, state, verbose, nested + 1,
File "/usr/lib/python3.8/sre_parse.py", line 671, in _parse
raise source.error("multiple repeat",
re.error: multiple repeat at position 4 (line 3, column 2)
Expected behavior
`splitted` contains the split markdown and no errors occur.
The text was updated successfully, but these errors were encountered:
Fixes #5614
#### Issue
The `***` combination raises an exception when used as a separator in
`re.split`. Instead, `\*\*\*` should be used in regular expressions.
#### Who can review?
@eyurtsev
Fixes langchain-ai#5614
#### Issue
The `***` combination raises an exception when used as a separator in
`re.split`. Instead, `\*\*\*` should be used in regular expressions.
#### Who can review?
@eyurtsev
System Info
langchain 0.0.188
python 3.8.10
Who can help?
No response
Information
Related Components
Reproduction
Expected behavior
`splitted` contains the split markdown and no errors occur.
The text was updated successfully, but these errors were encountered: