Skip to content

Commit

Permalink
fix: handle mimetypes with encodings
Browse files Browse the repository at this point in the history
As reported in #112 and demonstrated in the test (prior to fixing), we
didn't handle mimetypes correctly if they contained an encoding, for
instance `text/html; charset=utf-8`. The fix is to split the string on
the semicolon and use only the mimetype (the part before the first
semicolon) for the matching.

This closes #112.
  • Loading branch information
bjchambers committed Mar 25, 2024
1 parent 1393251 commit b9d8d7c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
2 changes: 2 additions & 0 deletions dewy/common/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ async def extract_content(
logger.debug("Inferred mime type '{}' from path '{}'", mimetype, filename)
if encoding is not None:
raise ValueError(f"Unsupported encoding: '{encoding}'")
else:
mimetype = mimetype.split(";", 2)[0]

match mimetype:
case "application/pdf":
Expand Down
9 changes: 8 additions & 1 deletion tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from filetype.types.document import Docx

from dewy.common.extract import extract_content
from dewy.common.extract import extract_content, extract_url
from tests.conftest import NEARLY_EMPTY_PATH, NEARLY_EMPTY_TEXT, TEST_DATA_DIR

NEARLY_EMPTY_MD_PATH = os.path.join(TEST_DATA_DIR, "nearly_empty.md")
Expand Down Expand Up @@ -50,6 +50,13 @@ async def test_extract_content_html_mimetype():
assert result.text == NEARLY_EMPTY_HTML_TEXT


async def test_extract_content_html_url():
    """Extract a live HTML page by URL and check for a known phrase.

    The page is fetched through ``extract_url``, so this test needs network
    access and depends on the remote page continuing to contain the phrase.
    """
    url = "https://python.langchain.com/docs/expression_language/cookbook/retrieval"
    extracted = await extract_url(url)
    assert "retrieval-augmented generation" in extracted.text


async def test_extract_content_html_extension():
content = None
with open(NEARLY_EMPTY_HTML_PATH, "rb") as input_file:
Expand Down

0 comments on commit b9d8d7c

Please sign in to comment.