Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(megaparse): add sdk #3462

Merged
merged 2 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,12 @@ dependencies = [
"transformers[sentencepiece]>=4.44.2",
"faiss-cpu>=1.8.0.post1",
"rapidfuzz>=3.10.1",
"megaparse-sdk>=0.1.2",
"markupsafe>=2.1.5",
]
readme = "README.md"
requires-python = ">= 3.11"

[project.optional-dependencies]
all = [
"unstructured[epub,docx,odt,doc,pptx,ppt,xlsx,md]>=0.15.5",
"docx2txt>=0.8",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
Expand Down
23 changes: 18 additions & 5 deletions core/quivr_core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,22 @@
from pydantic import BaseModel


class PdfParser(str, Enum):
LLAMA_PARSE = "llama_parse"
class ParserType(str, Enum):
    """Identifiers of the parsing back-ends a Megaparse request can select.

    Members subclass ``str``, so each one compares equal to its raw string
    value and can be used wherever that plain string is expected.
    """

    UNSTRUCTURED = "unstructured"
    LLAMA_PARSER = "llama_parser"
    MEGAPARSE_VISION = "megaparse_vision"


class StrategyEnum(str, Enum):
    """Method to use for the conversion.

    Members subclass ``str``, so each one compares equal to its raw string
    value and can be used wherever that plain string is expected.
    """

    FAST = "fast"
    AUTO = "auto"
    HI_RES = "hi_res"


class MegaparseBaseConfig(BaseModel):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to configure it from YAML?

@classmethod
def from_yaml(cls, file_path: str):
Expand All @@ -22,6 +32,9 @@ def from_yaml(cls, file_path: str):


class MegaparseConfig(MegaparseBaseConfig):
strategy: str = "fast"
llama_parse_api_key: str | None = None
pdf_parser: PdfParser = PdfParser.UNSTRUCTURED
method: ParserType = ParserType.UNSTRUCTURED
strategy: StrategyEnum = StrategyEnum.AUTO
check_table: bool = False
parsing_instruction: str | None = None
model_name: str = "gpt-4o"
timeout: int = 60 # FIXME: Chloé not sure necessary
39 changes: 34 additions & 5 deletions core/quivr_core/processor/implementations/megaparse_processor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import os

import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse import MegaParse
from megaparse_sdk import MegaParseSDK

from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile
Expand All @@ -29,7 +30,24 @@ class MegaparseProcessor(ProcessorBase):

"""

supported_extensions = [FileExtension.pdf]
supported_extensions = [
FileExtension.pdf,
FileExtension.docx,
FileExtension.doc,
FileExtension.pptx,
FileExtension.xls,
FileExtension.xlsx,
FileExtension.csv,
FileExtension.epub,
FileExtension.bib,
FileExtension.odt,
FileExtension.html,
FileExtension.py,
FileExtension.markdown,
FileExtension.md,
FileExtension.mdx,
FileExtension.ipynb,
]

def __init__(
self,
Expand All @@ -56,9 +74,20 @@ def processor_metadata(self):
}

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
document: Document = await mega_parse.aload()
if len(document.page_content) > self.splitter_config.chunk_size:
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
megaparse = MegaParseSDK(api_key)
logger.info(f"Uploading file {file.path} to MegaParse")
response = await megaparse.file.upload(
file_path=file.path,
method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision
strategy="auto",
)
logger.info(f"Parsed file : {response}")

document = Document(
page_content=response,
)
if len(response) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
Expand Down
83 changes: 52 additions & 31 deletions core/quivr_core/processor/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import types
from dataclasses import dataclass, field
from heapq import heappop, heappush
from typing import Type, TypeAlias
from typing import List, Type, TypeAlias

from quivr_core.files.file import FileExtension

Expand Down Expand Up @@ -49,37 +49,41 @@ class ProcEntry:

def _append_proc_mapping(
mapping: ProcMapping,
file_ext: FileExtension | str,
file_exts: List[FileExtension] | List[str],
cls_mod: str,
errtxt: str,
priority: int | None,
):
if file_ext in mapping:
try:
prev_proc = heappop(mapping[file_ext])
proc_entry = ProcEntry(
priority=priority if priority is not None else prev_proc.priority - 1,
cls_mod=cls_mod,
err=errtxt,
)
# Push the previous processor back
heappush(mapping[file_ext], prev_proc)
heappush(mapping[file_ext], proc_entry)
except IndexError:
for file_ext in file_exts:
if file_ext in mapping:
try:
prev_proc = heappop(mapping[file_ext])
proc_entry = ProcEntry(
priority=priority
if priority is not None
else prev_proc.priority - 1,
cls_mod=cls_mod,
err=errtxt,
)
# Push the previous processor back
heappush(mapping[file_ext], prev_proc)
heappush(mapping[file_ext], proc_entry)
except IndexError:
proc_entry = ProcEntry(
priority=priority if priority is not None else _LOWEST_PRIORITY,
cls_mod=cls_mod,
err=errtxt,
)
heappush(mapping[file_ext], proc_entry)

else:
proc_entry = ProcEntry(
priority=priority if priority is not None else _LOWEST_PRIORITY,
cls_mod=cls_mod,
err=errtxt,
)
heappush(mapping[file_ext], proc_entry)

else:
proc_entry = ProcEntry(
priority=priority if priority is not None else _LOWEST_PRIORITY,
cls_mod=cls_mod,
err=errtxt,
)
mapping[file_ext] = [proc_entry]
mapping[file_ext] = [proc_entry]


def defaults_to_proc_entries(
Expand Down Expand Up @@ -109,21 +113,38 @@ def defaults_to_proc_entries(
ext_str = ext.value if isinstance(ext, FileExtension) else ext
_append_proc_mapping(
mapping=base_processors,
file_ext=ext,
file_exts=[ext],
cls_mod=f"quivr_core.processor.implementations.default.{processor_name}",
errtxt=f"can't import {processor_name}. Please install quivr-core[{ext_str}] to access {processor_name}",
priority=None,
)

# TODO(@aminediro): Megaparse should register itself
# Append Megaparse
# _append_proc_mapping(
# mapping=base_processors,
# file_ext=FileExtension.pdf,
# cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
# errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
# priority=None,
# )
_append_proc_mapping(
mapping=base_processors,
file_exts=[
FileExtension.pdf,
FileExtension.docx,
FileExtension.doc,
FileExtension.pptx,
FileExtension.xls,
FileExtension.xlsx,
FileExtension.csv,
FileExtension.epub,
FileExtension.bib,
FileExtension.odt,
FileExtension.html,
FileExtension.py,
FileExtension.markdown,
FileExtension.md,
FileExtension.mdx,
FileExtension.ipynb,
],
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
priority=None,
)
return base_processors


Expand Down Expand Up @@ -181,7 +202,7 @@ def register_processor(
if all(proc_cls != proc.cls_mod for proc in known_processors[file_ext]):
_append_proc_mapping(
known_processors,
file_ext=file_ext,
file_exts=[file_ext],
cls_mod=proc_cls,
errtxt=errtxt
or f"{proc_cls} import failed for processor of {file_ext}",
Expand Down
51 changes: 0 additions & 51 deletions core/tests/processor/pdf/test_megaparse_pdf_processor.py

This file was deleted.

44 changes: 44 additions & 0 deletions examples/simple_question_megaparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from quivr_core import Brain
from quivr_core.llm.llm_endpoint import LLMEndpoint
from quivr_core.rag.entities.config import LLMEndpointConfig
from rich.console import Console
from rich.panel import Panel
from rich.prompt import Prompt

if __name__ == "__main__":
    # Interactive demo: build a brain from a single DOCX file, then answer
    # questions typed at the prompt until the user enters "exit".

    # Fail fast when the key is missing: str(os.getenv(...)) would turn an
    # unset variable into the literal string "None" and pass it on as a key.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise SystemExit("OPENAI_API_KEY environment variable is not set")

    brain = Brain.from_files(
        name="test_brain",
        file_paths=["./tests/processor/docx/demo.docx"],
        llm=LLMEndpoint(
            llm_config=LLMEndpointConfig(model="gpt-4o"),
            llm=ChatOpenAI(model="gpt-4o", api_key=openai_api_key),
        ),
    )
    # NOTE(review): this embedder is created but never handed to the brain
    # built above -- confirm whether it should be passed to Brain.from_files.
    embedder = embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large",
    )
    # Check brain info
    brain.print_info()

    console = Console()
    console.print(Panel.fit("Ask your brain !", style="bold magenta"))

    while True:
        # Get user input
        question = Prompt.ask("[bold cyan]Question[/bold cyan]")

        # Typing "exit" (in any letter case) ends the session
        if question.lower() == "exit":
            console.print(Panel("Goodbye!", style="bold yellow"))
            break

        answer = brain.ask(question)
        # Print the answer, followed by a full-width separator line
        console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")

        console.print("-" * console.width)

    # Show the brain info once more after the question loop has ended
    brain.print_info()
Loading