diff --git a/examples/pipelines/indexing_pipeline_with_meta.py b/examples/pipelines/indexing_pipeline_with_meta.py
new file mode 100644
index 0000000000..01990564d6
--- /dev/null
+++ b/examples/pipelines/indexing_pipeline_with_meta.py
@@ -0,0 +1,62 @@
+from typing import Dict, Any
+from pathlib import Path
+from datetime import datetime
+
+from haystack import Pipeline
+from haystack.components.others import Multiplexer
+from haystack.components.converters import PyPDFToDocument, TextFileToDocument
+from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.components.routers import FileTypeRouter, DocumentJoiner
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores import InMemoryDocumentStore
+
+
+# Example: route the entries of the current directory by MIME type to a text or
+# PDF converter, passing the same metadata to both converters via a Multiplexer.
+
+document_store = InMemoryDocumentStore()
+
+# Pipeline components keyed by the name they are registered under; the dict
+# preserves insertion order, so components are added in the order listed here.
+components = {
+    "file_type_router": FileTypeRouter(mime_types=["text/plain", "application/pdf"]),
+    "metadata_multiplexer": Multiplexer(Dict[str, Any]),
+    "text_file_converter": TextFileToDocument(),
+    "pdf_file_converter": PyPDFToDocument(),
+    "joiner": DocumentJoiner(),
+    "cleaner": DocumentCleaner(),
+    "splitter": DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30),
+    "writer": DocumentWriter(document_store=document_store),
+}
+
+# (sender, receiver) pairs, connected in the order listed. The multiplexer feeds
+# the `meta` input of both converters, so one value reaches each of them.
+connections = [
+    ("file_type_router.text/plain", "text_file_converter.sources"),
+    ("file_type_router.application/pdf", "pdf_file_converter.sources"),
+    ("metadata_multiplexer", "text_file_converter.meta"),
+    ("metadata_multiplexer", "pdf_file_converter.meta"),
+    ("text_file_converter.documents", "joiner.documents"),
+    ("pdf_file_converter.documents", "joiner.documents"),
+    ("joiner.documents", "cleaner.documents"),
+    ("cleaner.documents", "splitter.documents"),
+    ("splitter.documents", "writer.documents"),
+]
+
+p = Pipeline()
+for name, component in components.items():
+    p.add_component(instance=component, name=name)
+for sender, receiver in connections:
+    p.connect(sender, receiver)
+
+# Every entry of the current directory is offered to the router; the metadata
+# is supplied once, as the multiplexer's `value` input.
+run_inputs = {
+    "file_type_router": {"sources": list(Path(".").iterdir())},
+    "metadata_multiplexer": {"value": {"date_added": datetime.now().isoformat()}},
+}
+result = p.run(run_inputs)
+
+# Sanity check: each document in the store carries the routed metadata key.
+stored_documents = document_store.filter_documents()
+assert all("date_added" in doc.meta for doc in stored_documents)
diff --git a/releasenotes/notes/route-meta-to-converters-ed85acc43b5aa96a.yaml b/releasenotes/notes/route-meta-to-converters-ed85acc43b5aa96a.yaml
new file mode 100644
index 0000000000..73f67d99b4
--- /dev/null
+++ b/releasenotes/notes/route-meta-to-converters-ed85acc43b5aa96a.yaml
@@ -0,0 +1,3 @@
+---
+enhancements:
+  - Add example script about how to use Multiplexer to route meta to file converters.