
Code review fixes; improve unit tests
HuiDBK committed Jul 22, 2024
1 parent 79334de commit 758acf8
Showing 18 changed files with 372 additions and 179 deletions.
4 changes: 4 additions & 0 deletions config/config2.example.yaml
@@ -59,3 +59,7 @@ iflytek_api_key: "YOUR_API_KEY"
iflytek_api_secret: "YOUR_API_SECRET"

metagpt_tti_url: "YOUR_MODEL_URL"

omniparse:
api_key: "YOUR_API_KEY"
base_url: "YOUR_BASE_URL"
6 changes: 1 addition & 5 deletions config/config2.yaml
@@ -5,8 +5,4 @@ llm:
api_type: "openai" # or azure / ollama / groq etc.
model: "gpt-4-turbo" # or gpt-3.5-turbo
base_url: "https://api.openai.com/v1" # or forward url / other llm url
api_key: "xxxx"

omniparse:
api_key: "your_api_key"
base_url: "http://192.168.50.126:8000"
api_key: "YOUR_API_KEY"
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
30 changes: 12 additions & 18 deletions examples/rag/omniparse_client.py → examples/rag/omniparse.py
@@ -1,18 +1,16 @@
import asyncio

from llama_parse import ResultType

from metagpt.config2 import config
from metagpt.logs import logger
from metagpt.rag.parser.omniparse.client import OmniParseClient
from metagpt.rag.parser.omniparse.parse import OmniParse
from metagpt.rag.schema import OmniParseOptions, OmniParseType
from metagpt.const import EXAMPLE_DATA_PATH

TEST_DOCX = EXAMPLE_DATA_PATH / "parse/test01.docx"
TEST_PDF = EXAMPLE_DATA_PATH / "parse/test02.pdf"
TEST_VIDEO = EXAMPLE_DATA_PATH / "parse/test03.mp4"
TEST_AUDIO = EXAMPLE_DATA_PATH / "parse/test04.mp3"
from metagpt.logs import logger
from metagpt.rag.parser import OmniParse
from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
from metagpt.utils.omniparse_client import OmniParseClient

TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx"
TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf"
TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4"
TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3"
TEST_WEBSITE_URL = "https://github.com/geekan/MetaGPT"


@@ -37,21 +35,17 @@ async def omniparse_client_example():
audio_parse_ret = await client.parse_audio(filelike=TEST_AUDIO)
logger.info(audio_parse_ret)

# website fixme: the official OmniParse API still has issues
# website_parse_ret = await client.parse_website(url=TEST_WEBSITE_URL)
# logger.info(website_parse_ret)


async def omniparse_example():
parser = OmniParse(
api_key=config.omniparse.api_key,
base_url=config.omniparse.base_url,
parse_options=OmniParseOptions(
parse_type=OmniParseType.PDF,
result_type=ResultType.MD,
result_type=ParseResultType.MD,
max_timeout=120,
num_workers=3,
)
),
)
ret = parser.load_data(file_path=TEST_PDF)
logger.info(ret)
@@ -67,5 +61,5 @@ async def main():
await omniparse_example()


if __name__ == '__main__':
if __name__ == "__main__":
asyncio.run(main())
3 changes: 2 additions & 1 deletion metagpt/config2.py
@@ -12,7 +12,8 @@
from pydantic import BaseModel, model_validator

from metagpt.configs.browser_config import BrowserConfig
from metagpt.configs.embedding_config import EmbeddingConfig, OmniParseConfig
from metagpt.configs.embedding_config import EmbeddingConfig
from metagpt.configs.file_parser_config import OmniParseConfig
from metagpt.configs.llm_config import LLMConfig, LLMType
from metagpt.configs.mermaid_config import MermaidConfig
from metagpt.configs.redis_config import RedisConfig
5 changes: 0 additions & 5 deletions metagpt/configs/embedding_config.py
@@ -52,8 +52,3 @@ def check_api_type(cls, v):
if v == "":
return None
return v


class OmniParseConfig(YamlModel):
api_key: str = ""
base_url: str = ""
6 changes: 6 additions & 0 deletions metagpt/configs/file_parser_config.py
@@ -0,0 +1,6 @@
from metagpt.utils.yaml_model import YamlModel


class OmniParseConfig(YamlModel):
api_key: str = ""
base_url: str = ""
51 changes: 35 additions & 16 deletions metagpt/rag/engines/simple.py
@@ -14,6 +14,7 @@
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.readers.base import BaseReader
from llama_index.core.response_synthesizers import (
BaseSynthesizer,
get_response_synthesizer,
@@ -27,7 +28,6 @@
QueryType,
TransformComponent,
)
from llama_parse import ResultType

from metagpt.config2 import config
from metagpt.rag.factories import (
@@ -38,15 +38,18 @@
get_retriever,
)
from metagpt.rag.interface import NoEmbedding, RAGObject
from metagpt.rag.parser.omniparse.parse import OmniParse
from metagpt.rag.parser import OmniParse
from metagpt.rag.retrievers.base import ModifiableRAGRetriever, PersistableRAGRetriever
from metagpt.rag.retrievers.hybrid_retriever import SimpleHybridRetriever
from metagpt.rag.schema import (
BaseIndexConfig,
BaseRankerConfig,
BaseRetrieverConfig,
BM25RetrieverConfig,
ObjectNode, OmniParseOptions, OmniParseType,
ObjectNode,
OmniParseOptions,
OmniParseType,
ParseResultType,
)
from metagpt.utils.common import import_class

@@ -76,18 +79,6 @@ def __init__(
)
self._transformations = transformations or self._default_transformations()

@classmethod
def get_file_extractor(cls, file_type: str):
if not config.omniparse.base_url:
return
parser = OmniParse(
api_key=config.omniparse.api_key,
base_url=config.omniparse.base_url,
parse_options=OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ResultType.MD)
)
file_extractor = {file_type: parser}
return file_extractor

@classmethod
def from_docs(
cls,
@@ -115,7 +106,7 @@ def from_docs(
if not input_dir and not input_files:
raise ValueError("Must provide either `input_dir` or `input_files`.")

file_extractor = cls.get_file_extractor(file_type=".pdf")
file_extractor = cls._get_file_extractor(file_type=".pdf")
documents = SimpleDirectoryReader(
input_dir=input_dir, input_files=input_files, file_extractor=file_extractor
).load_data()
@@ -319,3 +310,31 @@ def _resolve_embed_model(embed_model: BaseEmbedding = None, configs: list[Any] =
@staticmethod
def _default_transformations():
return [SentenceSplitter()]

@staticmethod
def _get_file_extractor(file_type: str = None) -> dict[str, BaseReader]:
"""
Get the file extractor for a specified file type.
If no file type is provided, return all available extractors.
Currently, only OmniParse PDF extraction is supported.
Args:
file_type: The type of file for which the extractor is needed. Defaults to None.
Returns:
dict[str, BaseReader]: mapping from file extension to its reader.
"""
file_extractor_mapping: dict[str, BaseReader] = {}
if config.omniparse.base_url:
pdf_parser = OmniParse(
api_key=config.omniparse.api_key,
base_url=config.omniparse.base_url,
parse_options=OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ParseResultType.MD),
)
file_extractor_mapping[".pdf"] = pdf_parser

if file_type:
file_extractor = file_extractor_mapping.get(file_type)
return {file_type: file_extractor} if file_extractor else {}

return file_extractor_mapping
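
A rough usage sketch of the refactored extractor wiring (the file path is a placeholder, and the query call assumes the engine exposes llama-index's standard query() interface): once omniparse.base_url is set in config2.yaml, SimpleEngine.from_docs picks up the OmniParse PDF reader through _get_file_extractor without extra arguments.

from metagpt.rag.engines.simple import SimpleEngine

# "data/report.pdf" is a placeholder path; an embedding model must also be
# configured for the engine to build its index.
engine = SimpleEngine.from_docs(input_files=["data/report.pdf"])
print(engine.query("Summarize the report."))  # assumed query() API inherited from llama-index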
3 changes: 3 additions & 0 deletions metagpt/rag/parser/__init__.py
@@ -0,0 +1,3 @@
from metagpt.rag.parser.omniparse import OmniParse

__all__ = ["OmniParse"]
@@ -1,28 +1,31 @@
import asyncio
from fileinput import FileInput
from pathlib import Path
from typing import List, Union, Optional
from typing import List, Optional, Union

from llama_index.core import Document
from llama_index.core.async_utils import run_jobs
from llama_index.core.readers.base import BaseReader
from llama_parse import ResultType

from metagpt.rag.parser.omniparse.client import OmniParseClient
from metagpt.rag.schema import OmniParseOptions, OmniParseType
from metagpt.logs import logger
from metagpt.rag.schema import OmniParseOptions, OmniParseType
from metagpt.utils.async_helper import NestAsyncio
from metagpt.utils.omniparse_client import OmniParseClient


class OmniParse(BaseReader):
"""OmniParse"""

def __init__(
self,
api_key=None,
base_url="http://localhost:8000",
parse_options: OmniParseOptions = None
self, api_key: str = None, base_url: str = "http://localhost:8000", parse_options: OmniParseOptions = None
):
"""
Args:
api_key: Default None, can be used for authentication later.
base_url: OmniParse Base URL for the API.
parse_options: Optional settings for OmniParse. Default is OmniParseOptions with default values.
"""
self.parse_options = parse_options or OmniParseOptions()
self.omniparse_client = OmniParseClient(api_key, base_url, max_timeout=self.parse_options.max_timeout)

@@ -47,20 +50,32 @@ def result_type(self, result_type: Union[str, ResultType]):
self.parse_options.result_type = result_type

async def _aload_data(
self,
file_path: Union[str, bytes, Path],
extra_info: Optional[dict] = None,
self,
file_path: Union[str, bytes, Path],
extra_info: Optional[dict] = None,
) -> List[Document]:
"""
Load data from the input file_path.
Args:
file_path: File path or file byte data.
extra_info: Optional dictionary containing additional information.
Returns:
List[Document]
"""
try:
if self.parse_type == OmniParseType.PDF:
# Currently only PDF parsing is supported
# pdf parse
parsed_result = await self.omniparse_client.parse_pdf(file_path)
else:
# other parse use omniparse_client.parse_document
# For compatible byte data, additional filename is required
extra_info = extra_info or {}
filename = extra_info.get("filename")  # for byte data, the filename must be passed in extra_info
filename = extra_info.get("filename")
parsed_result = await self.omniparse_client.parse_document(file_path, bytes_filename=filename)

# Get the specified structured data
# Get the specified structured data based on result_type
content = getattr(parsed_result, self.result_type)
docs = [
Document(
@@ -75,26 +90,51 @@ async def _aload_data(
return docs

async def aload_data(
self,
file_path: Union[List[FileInput], FileInput],
extra_info: Optional[dict] = None,
self,
file_path: Union[List[FileInput], FileInput],
extra_info: Optional[dict] = None,
) -> List[Document]:
"""
Load data from the input file_path.
Args:
file_path: File path or file byte data.
extra_info: Optional dictionary containing additional information.
Notes:
This method ultimately calls _aload_data for processing.
Returns:
List[Document]
"""
docs = []
if isinstance(file_path, (str, bytes, Path)):
# Process a single file
# Processing single file
docs = await self._aload_data(file_path, extra_info)
elif isinstance(file_path, list):
# Process multiple files concurrently
# Concurrently process multiple files
parse_jobs = [self._aload_data(file_item, extra_info) for file_item in file_path]
doc_ret_list = await run_jobs(jobs=parse_jobs, workers=self.parse_options.num_workers)
docs = [doc for docs in doc_ret_list for doc in docs]
return docs

def load_data(
self,
file_path: Union[List[FileInput], FileInput],
extra_info: Optional[dict] = None,
self,
file_path: Union[List[FileInput], FileInput],
extra_info: Optional[dict] = None,
) -> List[Document]:
"""Load data from the input path."""
NestAsyncio.apply_once()  # compatible with nested async calls
"""
Load data from the input file_path.
Args:
file_path: File path or file byte data.
extra_info: Optional dictionary containing additional information.
Notes:
This method ultimately calls aload_data for processing.
Returns:
List[Document]
"""
NestAsyncio.apply_once() # Ensure compatibility with nested async calls
return asyncio.run(self.aload_data(file_path, extra_info))
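
A hedged example of the batch path described in the aload_data docstring above; the file names and server URL are placeholders and assume a reachable OmniParse instance.

import asyncio

from metagpt.rag.parser import OmniParse
from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType


async def parse_reports():
    parser = OmniParse(
        base_url="http://localhost:8000",  # placeholder; use config.omniparse.base_url in practice
        parse_options=OmniParseOptions(
            parse_type=OmniParseType.PDF,
            result_type=ParseResultType.MD,
            num_workers=3,  # must satisfy 0 < num_workers < 10
        ),
    )
    # Passing a list triggers the concurrent branch of aload_data; a single path also works.
    return await parser.aload_data(["report_a.pdf", "report_b.pdf"])


# docs = asyncio.run(parse_reports())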
2 changes: 0 additions & 2 deletions metagpt/rag/parser/omniparse/__init__.py

This file was deleted.

19 changes: 13 additions & 6 deletions metagpt/rag/schema.py
@@ -8,7 +8,6 @@
from llama_index.core.indices.base import BaseIndex
from llama_index.core.schema import TextNode
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_parse import ResultType
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator

from metagpt.config2 import config
@@ -218,23 +217,31 @@ def get_obj_metadata(obj: RAGObject) -> dict:


class OmniParseType(str, Enum):
"""OmniParse解析类型"""
"""OmniParseType"""

PDF = "PDF"
DOCUMENT = "DOCUMENT"


class ParseResultType(str, Enum):
"""The result type for the parser."""

TXT = "text"
MD = "markdown"
JSON = "json"


class OmniParseOptions(BaseModel):
"""OmniParse可选配置"""

result_type: ResultType = Field(default=ResultType.MD, description="Result type returned by OmniParse")
parse_type: OmniParseType = Field(default=OmniParseType.DOCUMENT, description="OmniParse parse type, DOCUMENT by default")
max_timeout: Optional[int] = Field(default=120, description="Maximum timeout for OmniParse service requests")
result_type: ParseResultType = Field(default=ParseResultType.MD, description="OmniParse result_type")
parse_type: OmniParseType = Field(default=OmniParseType.DOCUMENT, description="OmniParse parse_type")
max_timeout: Optional[int] = Field(default=120, description="Maximum timeout for OmniParse service requests")
num_workers: int = Field(
default=5,
gt=0,
lt=10,
description="Number of concurrent requests when parsing a list of files",
description="Number of concurrent requests for multiple files",
)
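
A quick validation sketch for the new ParseResultType enum and the num_workers bounds; the failing case in the comment follows from the pydantic constraints above (gt=0, lt=10).

from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType

opts = OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ParseResultType.JSON, num_workers=5)
assert opts.result_type == ParseResultType.JSON
# OmniParseOptions(num_workers=10) would raise a pydantic ValidationError, since num_workers must be < 10.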

