Quansight · pmeier · Jan 28, 2024 · Jan 23, 2024 · Jan 23, 2024 · Jan 23, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -60,6 +60,7 @@ all = [
     "pyarrow",
     "pymupdf>=1.23.6",
     "python-docx",
+    "python-pptx",
     "tiktoken",
 ]
 
@@ -143,6 +144,7 @@ module = [
     "param",
     "pyarrow",
     "docx",
+    "pptx",
     "sentence_transformers",
 ]
 ignore_missing_imports = true

diff --git a/ragna/core/__init__.py b/ragna/core/__init__.py
@@ -6,6 +6,7 @@
     "DocumentHandler",
     "DocumentUploadParameters",
     "DocxDocumentHandler",
+    "PptxDocumentHandler",
     "EnvVarRequirement",
     "LocalDocument",
     "Message",
@@ -39,6 +40,7 @@
     Page,
     PdfDocumentHandler,
     PlainTextDocumentHandler,
+    PptxDocumentHandler,
 )
 
 # isort: split

diff --git a/ragna/core/_document.py b/ragna/core/_document.py
@@ -301,3 +301,33 @@ def extract_pages(self, document: Document) -> Iterator[Page]:
             text = paragraph.text
             if len(text) > 0:
                 yield Page(text=text)
+
+
+@DOCUMENT_HANDLERS.load_if_available
+class PptxDocumentHandler(DocumentHandler):
+    """Document handler that turns each slide of a `.pptx` file into a page.
+
+    !!! info "Package requirements"
+
+        - [`python-pptx`](https://github.com/scanny/python-pptx)
+    """
+
+    @classmethod
+    def requirements(cls) -> list[Requirement]:
+        return [PackageRequirement("python-pptx")]
+
+    @classmethod
+    def supported_suffixes(cls) -> list[str]:
+        return [".pptx"]
+
+    def extract_pages(self, document: Document) -> Iterator[Page]:
+        import pptx  # local import: only needed once a .pptx is parsed
+
+        presentation = pptx.Presentation(io.BytesIO(document.read()))
+        for number, slide in enumerate(presentation.slides, start=1):
+            texts = []
+            for shape in slide.shapes:
+                # Keep only shapes that have a text frame with non-empty text.
+                if shape.has_text_frame and shape.text:
+                    texts.append(shape.text)
+            yield Page(text="\n\n".join(texts), number=number)
diff --git a/tests/core/test_document.py b/tests/core/test_document.py
@@ -1,6 +1,7 @@
 import docx
+import pptx
 
-from ragna.core import DocxDocumentHandler, LocalDocument
+from ragna.core import DocxDocumentHandler, LocalDocument, PptxDocumentHandler
 
 
 def get_docx_document(tmp_path, docx_text):
@@ -20,3 +21,24 @@ def test_docx(tmp_path):
     assert len(pages) == 2
     for page in pages:
         assert page.text == docx_text
+
+
+def get_pptx_document(tmp_path, pptx_text):
+    deck = pptx.Presentation()
+    title_slide = deck.slides.add_slide(deck.slide_layouts[0])
+    title_slide.shapes.title.text = pptx_text  # slide 1: text in the title
+    box_slide = deck.slides.add_slide(deck.slide_layouts[0])
+    box_slide.shapes.add_textbox(0, 0, 100, 100).text = pptx_text  # slide 2: textbox
+    pptx_path = tmp_path / "test_document.pptx"
+    deck.save(pptx_path)
+    return LocalDocument.from_path(pptx_path)
+
+
+def test_pptx(tmp_path):
+    """The `.pptx` handler is picked up and yields one page per slide."""
+    expected = "ragna is neat!"
+    document = get_pptx_document(tmp_path, expected)
+    assert isinstance(document.handler, PptxDocumentHandler)
+    pages = list(document.extract_pages())
+    assert len(pages) == 2
+    assert all(page.text == expected for page in pages)