Skip to content

Commit

Permalink
feat: Add example script about routing metadata to converters in inde…
Browse files Browse the repository at this point in the history
…xing pipelines (#6702)

* support single metadata dict in markdown2document

* reno

* unwrap list

* direct key access

* typing

* add example of indexing pipeline using Multiplexer

* reno
  • Loading branch information
ZanSara authored Jan 9, 2024
1 parent abd16ab commit 9fe80fd
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
43 changes: 43 additions & 0 deletions examples/pipelines/indexing_pipeline_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Dict, Any
from pathlib import Path
from datetime import datetime

from haystack import Pipeline
from haystack.components.others import Multiplexer
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter, DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack.document_stores import InMemoryDocumentStore


# Example: index plain-text and PDF files while attaching the same metadata
# dict to every document, by sending it through a Multiplexer that is wired
# to the `meta` input of each converter.

document_store = InMemoryDocumentStore()

p = Pipeline()

# (name, instance) pairs, registered in this order. The names are the ones
# referenced by the connection table and by the `run()` input below.
pipeline_components = [
    ("file_type_router", FileTypeRouter(mime_types=["text/plain", "application/pdf"])),
    ("metadata_multiplexer", Multiplexer(Dict[str, Any])),
    ("text_file_converter", TextFileToDocument()),
    ("pdf_file_converter", PyPDFToDocument()),
    ("joiner", DocumentJoiner()),
    ("cleaner", DocumentCleaner()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30)),
    ("writer", DocumentWriter(document_store=document_store)),
]
for component_name, component in pipeline_components:
    p.add_component(instance=component, name=component_name)

# (sender, receiver) pairs, connected in this order.
pipeline_connections = [
    # Each router output (named after its MIME type) feeds the matching converter.
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pdf_file_converter.sources"),
    # The multiplexer output is wired to the `meta` input of both converters.
    ("metadata_multiplexer", "text_file_converter.meta"),
    ("metadata_multiplexer", "pdf_file_converter.meta"),
    # Both converters feed the joiner, followed by cleaning, splitting and writing.
    ("text_file_converter.documents", "joiner.documents"),
    ("pdf_file_converter.documents", "joiner.documents"),
    ("joiner.documents", "cleaner.documents"),
    ("cleaner.documents", "splitter.documents"),
    ("splitter.documents", "writer.documents"),
]
for sender, receiver in pipeline_connections:
    p.connect(sender, receiver)

# Every entry of the current working directory is handed to the router.
sources = list(Path(".").iterdir())
# Metadata dict passed to the multiplexer's `value` input.
shared_meta = {"date_added": datetime.now().isoformat()}

result = p.run(
    {
        "file_type_router": {"sources": sources},
        "metadata_multiplexer": {"value": shared_meta},
    }
)

# Sanity check: every document returned by the store has the `date_added` key.
assert all("date_added" in doc.meta for doc in document_store.filter_documents())
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
enhancements:
- Add an example script showing how to use Multiplexer to route metadata to file converters.

0 comments on commit 9fe80fd

Please sign in to comment.