diff --git a/core/quivr_core/files/file.py b/core/quivr_core/files/file.py
index 9f4089b103fb..0a778f176567 100644
--- a/core/quivr_core/files/file.py
+++ b/core/quivr_core/files/file.py
@@ -112,9 +112,9 @@ def __init__(
         id: UUID,
         original_filename: str,
         path: Path,
-        brain_id: UUID,
         file_sha1: str,
         file_extension: FileExtension | str,
+        brain_id: UUID | None = None,
         file_size: int | None = None,
         metadata: dict[str, Any] | None = None,
     ) -> None:
diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py
index 255a2f885da7..cd73d57dee53 100644
--- a/core/quivr_core/processor/implementations/megaparse_processor.py
+++ b/core/quivr_core/processor/implementations/megaparse_processor.py
@@ -3,6 +3,7 @@
 import tiktoken
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
+from megaparse import MegaParse
 
 from quivr_core.config import MegaparseConfig
 from quivr_core.files.file import QuivrFile
@@ -55,14 +56,11 @@ def processor_metadata(self):
         }
 
     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
-        # mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
-        # document: Document = await mega_parse.aload()
-        # if len(document.page_content) > self.splitter_config.chunk_size:
-        #     docs = self.text_splitter.split_documents([document])
-        #     for doc in docs:
-        #         # if "Production Fonts (maximum)" in doc.page_content:
-        #         #     print('Doc: ', doc.page_content)
-        #         doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
-        #     return docs
-        # return [document]
-        return []
+        mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
+        document: Document = await mega_parse.aload()
+        if len(document.page_content) > self.splitter_config.chunk_size:
+            docs = self.text_splitter.split_documents([document])
+            for doc in docs:
+                doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
+            return docs
+        return [document]
diff --git a/core/quivr_core/processor/implementations/tika_processor.py b/core/quivr_core/processor/implementations/tika_processor.py
index 11c6f798118e..0ba9f13fa23b 100644
--- a/core/quivr_core/processor/implementations/tika_processor.py
+++ b/core/quivr_core/processor/implementations/tika_processor.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import AsyncIterable
 
 import httpx
@@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):
 
     def __init__(
         self,
-        tika_url: str = "http://localhost:9998/tika",
+        tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
         splitter: TextSplitter | None = None,
         splitter_config: SplitterConfig = SplitterConfig(),
         timeout: float = 5.0,
diff --git a/core/quivr_core/processor/processor_base.py b/core/quivr_core/processor/processor_base.py
index 1b8cbbe39423..5d41902b940b 100644
--- a/core/quivr_core/processor/processor_base.py
+++ b/core/quivr_core/processor/processor_base.py
@@ -2,7 +2,6 @@
 from abc import ABC, abstractmethod
 from importlib.metadata import PackageNotFoundError, version
 from typing import Any
-from uuid import uuid4
 
 from langchain_core.documents import Document
 
@@ -43,7 +42,6 @@ async def process_file(self, file: QuivrFile) -> list[Document]:
                 "utf-8"
             )
             doc.metadata = {
-                "id": uuid4(),
                 "chunk_index": idx,
                 "quivr_core_version": qvr_version,
                 **file.metadata,
diff --git a/core/quivr_core/processor/registry.py b/core/quivr_core/processor/registry.py
index 697268a1e9c9..22fee5a1ba95 100644
--- a/core/quivr_core/processor/registry.py
+++ b/core/quivr_core/processor/registry.py
@@ -117,13 +117,13 @@ def defaults_to_proc_entries(
 
     # TODO(@aminediro): Megaparse should register itself
     # Append Megaparse
-    _append_proc_mapping(
-        mapping=base_processors,
-        file_ext=FileExtension.pdf,
-        cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
-        errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
-        priority=None,
-    )
+    # _append_proc_mapping(
+    #     mapping=base_processors,
+    #     file_ext=FileExtension.pdf,
+    #     cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
+    #     errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
+    #     priority=None,
+    # )
 
     return base_processors
 
diff --git a/core/quivr_core/rag/entities/models.py b/core/quivr_core/rag/entities/models.py
index f87f49b66f2a..e4e7867dcdfa 100644
--- a/core/quivr_core/rag/entities/models.py
+++ b/core/quivr_core/rag/entities/models.py
@@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):
 
 
 class KnowledgeStatus(str, Enum):
-    PROCESSING = "PROCESSING"
-    UPLOADED = "UPLOADED"
     ERROR = "ERROR"
     RESERVED = "RESERVED"
+    PROCESSING = "PROCESSING"
+    PROCESSED = "PROCESSED"
+    UPLOADED = "UPLOADED"
 
 
 class Source(BaseModel):
diff --git a/core/quivr_core/rag/quivr_rag_langgraph.py b/core/quivr_core/rag/quivr_rag_langgraph.py
index 49ba6d28276a..e61e912d18fc 100644
--- a/core/quivr_core/rag/quivr_rag_langgraph.py
+++ b/core/quivr_core/rag/quivr_rag_langgraph.py
@@ -1,20 +1,20 @@
+import asyncio
 import logging
 from typing import (
     Annotated,
+    Any,
     AsyncGenerator,
+    Dict,
     List,
     Optional,
     Sequence,
     Tuple,
-    TypedDict,
-    Dict,
-    Any,
     Type,
+    TypedDict,
 )
 from uuid import uuid4
-import asyncio
 
-# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
+import openai
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain_cohere import CohereRerank
 from langchain_community.document_compressors import JinaRerank
@@ -22,20 +22,17 @@
 from langchain_core.documents import BaseDocumentCompressor, Document
 from langchain_core.messages import BaseMessage
 from langchain_core.messages.ai import AIMessageChunk
-from langchain_core.vectorstores import VectorStore
 from langchain_core.prompts.base import BasePromptTemplate
-from langgraph.graph import START, END, StateGraph
+from langchain_core.vectorstores import VectorStore
+from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
 from langgraph.types import Send
-
-
 from pydantic import BaseModel, Field
-import openai
 
-from quivr_core.rag.entities.chat import ChatHistory
-from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.llm import LLMEndpoint
 from quivr_core.llm_tools.llm_tools import LLMToolFactory
+from quivr_core.rag.entities.chat import ChatHistory
+from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
 from quivr_core.rag.entities.models import (
     ParsedRAGChunkResponse,
     QuivrKnowledge,
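
A minimal usage sketch (not part of the patch) for the new Tika default: tika_url now falls back to the TIKA_SERVER_URL environment variable. Because the default argument is evaluated when tika_processor.py is imported, the variable has to be set before that import; passing tika_url explicitly works regardless. The host name below is hypothetical.

    import os

    # Must be set before quivr_core.processor.implementations.tika_processor is
    # imported, since os.getenv() in the default argument runs at module import time.
    os.environ["TIKA_SERVER_URL"] = "http://tika.internal:9998/tika"  # hypothetical host

    from quivr_core.processor.implementations.tika_processor import TikaProcessor

    processor_from_env = TikaProcessor()  # picks up TIKA_SERVER_URL
    processor_explicit = TikaProcessor(   # explicit override, independent of the env var
        tika_url="http://localhost:9998/tika",
        timeout=5.0,
    )

Note as well that megaparse_processor.py now imports megaparse at module level and process_file_inner calls MegaParse again, while the PDF entry in registry.py is commented out: the megaparse package must be installed for MegaparseProcessor to import, and it is no longer registered as a default PDF processor.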