Skip to content

Commit

Permalink
update (#77)
Browse files Browse the repository at this point in the history
配置修改,代码更新
  • Loading branch information
yaojin3616 authored Oct 16, 2023
2 parents ebeed02 + b2cc2db commit 3c90a82
Show file tree
Hide file tree
Showing 22 changed files with 443 additions and 33 deletions.
14 changes: 4 additions & 10 deletions docker/bisheng/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ chains:
CombineDocsChain:
documentation: ""
documentloaders:
ElemUnstructuredLoaderV0:
documentation: ""
AirbyteJSONLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/airbyte_json"
CoNLLULoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/conll-u"
CSVLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/csv"
UnstructuredEmailLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/email"
EverNoteLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/evernote"
FacebookChatLoader:
Expand All @@ -106,24 +106,16 @@ documentloaders:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/gutenberg"
BSHTMLLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
UnstructuredHTMLLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
UnstructuredMarkdownLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/markdown"
PyPDFDirectoryLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
PyPDFLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
UnstructuredPowerPointLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_powerpoint"
SRTLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/subtitle"
TelegramChatLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/telegram"
TextLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/"
UnstructuredWordDocumentLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_word"
WebBaseLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_base"
AZLyricsLoader:
Expand Down Expand Up @@ -334,6 +326,8 @@ utilities:
WolframAlphaAPIWrapper:
documentation: ""
retrievers:
MixEsVectorRetriever:
documentation: ""
MultiQueryRetriever:
documentation: "https://python.langchain.com/docs/modules/data_connection/retrievers/how_to/MultiQueryRetriever"
# https://github.com/supabase-community/supabase-py/issues/482
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
command: redis-server /etc/redis.conf

mysql:
image: mysql:5.7.40
image: mysql:8.0
environment:
- "MYSQL_ROOT_PASSWORD=1234" # 数据库密码,建议修改,如果修改需要同步修改bisheng/congfig/config.yaml配置
- "MYSQL_DATABASE=bisheng"
Expand Down
7 changes: 3 additions & 4 deletions docker/mysql/conf/my.cnf
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ default-character-set=utf8mb4
default-character-set=utf8mb4

[mysqld]
init_connect='SET collation_connection = utf8mb4_unicode_ci'
init_connect='SET NAMES utf8mb4'
init_connect='SET collation_connection = utf8mb4_unicode_ci, NAMES utf8mb4'
character-set-server=utf8mb4
collation-server=utf8mb4_unicode_ci
skip-character-set-client-handshake
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
# skip-character-set-client-handshake
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION
2 changes: 1 addition & 1 deletion src/backend/bisheng/chat/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ async def process_source_document(self, source_document: List[Document], chat_id

from bisheng.settings import settings
# 使用大模型进行关键词抽取,模型配置临时方案
keyword_conf = settings.default_llm.get('keyword_llm')
keyword_conf = settings.default_llm
host_base_url = keyword_conf.get('host_base_url')
model = keyword_conf.get('model')

Expand Down
10 changes: 0 additions & 10 deletions src/backend/bisheng/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,6 @@ documentloaders:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/conll-u"
CSVLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/csv"
UnstructuredEmailLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/email"
EverNoteLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/evernote"
FacebookChatLoader:
Expand All @@ -102,24 +100,16 @@ documentloaders:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/gutenberg"
BSHTMLLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
UnstructuredHTMLLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
UnstructuredMarkdownLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/markdown"
PyPDFDirectoryLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
PyPDFLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
UnstructuredPowerPointLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_powerpoint"
SRTLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/subtitle"
TelegramChatLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/telegram"
TextLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/"
UnstructuredWordDocumentLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_word"
WebBaseLoader:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_base"
AZLyricsLoader:
Expand Down
2 changes: 2 additions & 0 deletions src/backend/bisheng/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
from pathlib import Path
from typing import Optional

Expand Down Expand Up @@ -87,6 +88,7 @@ def setup_app(static_files_dir: Optional[Path] = None) -> FastAPI:


configure(log_level='DEBUG', log_file='./data/bisheng.log')
time.sleep(20)
app = create_app()

if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion src/bisheng-langchain/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## What is bisheng-langchain?

bisheng-langchain is an open-source library that extends langchain and is built to power LLM applications.
bisheng-langchain provides more components to support Chinese LLMs and and Chinese based token enviroments for prompt engineering and ICL template.
bisheng-langchain provides more components to support Chinese LLMs and Chinese-based token environments for prompt engineering and ICL templates.


The project is a sub-module of [bisheng](https://github.com/dataelement/bisheng).
Expand Down
7 changes: 4 additions & 3 deletions src/bisheng-langchain/bisheng_langchain/chains/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from bisheng_langchain.chains.combine_documents.stuff import StuffDocumentsChain

__all__ = [
'StuffDocumentsChain',
]
from .loader_output import LoaderOutputChain

__all__ = ['StuffDocumentsChain', 'LoaderOutputChain']

68 changes: 68 additions & 0 deletions src/bisheng-langchain/bisheng_langchain/chains/loader_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Chain that runs an arbitrary python function."""
import functools
import logging
import json
from typing import Any, Awaitable, Callable, Dict, List, Optional

from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.docstore.document import Document

logger = logging.getLogger(__name__)


class LoaderOutputChain(Chain):
    """Chain that outputs the text of already-loaded documents.

    The chain does not read the values of its inputs: every call joins the
    ``page_content`` of ``documents`` with blank lines and returns the result
    under ``output_key``.
    """

    # Documents whose ``page_content`` is emitted by the chain.
    documents: List[Document]
    input_key: str = "begin"  #: :meta private:
    output_key: str = "text"  #: :meta private:

    @staticmethod
    @functools.lru_cache
    def _log_once(msg: str) -> None:
        """Log a warning for ``msg``.

        Repeated calls with a message that is still held by the
        ``lru_cache`` are served from the cache and do not log again.

        :meta private:
        """
        logger.warning(msg)

    @property
    def input_keys(self) -> List[str]:
        """Expect input keys.

        :meta private:
        """
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Return output keys.

        :meta private:
        """
        return [self.output_key]

    def _build_output(self) -> Dict[str, str]:
        """Join every document's ``page_content`` into the output mapping.

        Shared by the sync and async entry points so both return exactly the
        same plain-text payload (documents separated by a blank line).
        """
        text = '\n\n'.join(doc.page_content for doc in self.documents)
        return {self.output_key: text}

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        """Return the joined document text under ``output_key``.

        ``inputs`` and ``run_manager`` are accepted for interface
        compatibility and are not used.
        """
        return self._build_output()

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Async counterpart of ``_call``; returns the same output.

        Previously this path returned a JSON-encoded list while ``_call``
        returned blank-line-joined text; both now share one implementation.
        """
        return self._build_output()
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .host_llm import HostBaichuanChat, HostChatGLM2, HostLlama2Chat, HostQwenChat
from .host_llm import CustomLLMChat, HostBaichuanChat, HostChatGLM2, HostLlama2Chat, HostQwenChat
from .minimax import ChatMinimaxAI
from .proxy_llm import ProxyChatLLM
from .wenxin import ChatWenxin
Expand All @@ -8,5 +8,6 @@
__all__ = [
'ProxyChatLLM', 'ChatMinimaxAI', 'ChatWenxin', 'ChatZhipuAI',
'ChatXunfeiAI',
'HostChatGLM2', 'HostBaichuanChat', 'HostLlama2Chat', 'HostQwenChat'
'HostChatGLM2', 'HostBaichuanChat', 'HostLlama2Chat', 'HostQwenChat',
'CustomLLMChat'
]
41 changes: 41 additions & 0 deletions src/bisheng-langchain/bisheng_langchain/chat_models/host_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,3 +409,44 @@ class HostLlama2Chat(BaseHostChatLLM):
def _llm_type(self) -> str:
"""Return type of chat model."""
return 'llama2_chat'


class CustomLLMChat(BaseHostChatLLM):
    """Chat model that talks to a custom LLM chat API.

    The API is expected to be compatible with the OpenAI chat definition.
    """

    # Model identifier placed in the request payload (alias: ``model``).
    model_name: str = Field('custom-llm-chat', alias='model')

    temperature: float = 0.1
    top_p: float = 0.1
    max_tokens: int = 8192

    @property
    def _llm_type(self) -> str:
        """Return type of chat model."""
        return 'custom_llm_chat'

    def completion_with_retry(self, **kwargs: Any) -> Any:
        """Send one chat request and return the decoded JSON response.

        The request is wrapped with the decorator produced by
        ``_create_retry_decorator(self)``. Only ``messages``, ``temperature``,
        ``top_p``, ``max_tokens`` and ``do_sample`` are read from ``kwargs``;
        any of them that is absent is sent as ``None``.
        """

        @_create_retry_decorator(self)
        def _completion_with_retry(**request: Any) -> Any:
            # Key order is kept stable so the serialized payload is unchanged.
            params = {
                'messages': request.get('messages'),
                'model': self.model_name,
                'top_p': request.get('top_p'),
                'temperature': request.get('temperature'),
                'max_tokens': request.get('max_tokens'),
                'do_sample': request.get('do_sample'),
            }

            if self.verbose:
                print('payload', params)

            response = self.client(url=self.host_base_url, json=params)
            return response.json()

        return _completion_with_retry(**kwargs)
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .elem_pdf import PDFWithSemanticLoader

from .elem_unstrcutured_loader import ElemUnstructuredLoader, ElemUnstructuredLoaderV0

__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0']
__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0', 'UniversalKVLoader']

Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from .image import LayoutParser
from .ocr_client import OCRClient
from .ellm_client import ELLMClient

__all__ = [
'LayoutParser',
'OCRClient',
'ELLMClient'
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# import base64
import copy
import base64
import requests
from typing import Any, Iterator, List, Mapping, Optional, Union


class ELLMClient(object):
    """Client that posts a base64 image and an ELLM schema to an endpoint.

    Args:
        api_base_url: URL the prediction request is posted to.
    """

    def __init__(self,
                 api_base_url: Optional[str] = None):
        self.ep = api_base_url
        self.client = requests.Session()
        # Handed to ``requests`` as the per-request timeout.
        # NOTE(review): requests interprets this in seconds -- confirm 10000
        # is intended.
        self.timeout = 10000
        # Base request parameters sent with every prediction.
        self.params = {
            'sort_filter_boxes': True,
            'enable_huarong_box_adjust': True,
            'support_long_image_segment': True,
            'checkbox': ['std_checkbox'],
            'rotateupright': True
        }

        # Per-scene parameters merged on top of ``self.params``.
        self.scene_mapping = {
            'doc': {
                'det': 'general_text_det_mrcnn_v1.0',
                'recog': 'transformer-v2.8-gamma-faster',
                'ellm': 'ELLM'
            },
            'form': {
                'det': 'mrcnn-v5.1',
                'recog': 'transformer-v2.8-gamma-faster',
                'ellm': 'ELLM'
            },
            'hand': {
                'det': 'mrcnn-v5.1',
                'recog': 'transformer-hand-v1.16-faster',
                'ellm': 'ELLM'
            }
        }

    def predict(self, inp):
        """Post one prediction request and return the decoded JSON reply.

        Args:
            inp: Mapping with ``b64_image`` (required), ``keys`` (required,
                sent as ``ellm_schema``) and ``scene`` (optional, defaults to
                ``'form'``). The mapping is only read, never modified, so the
                caller can safely reuse it (e.g. for a retry).

        Returns:
            The JSON body of the response, or
            ``{'status_code': 400, 'status_message': <error text>}`` when the
            request or the JSON decoding raises.

        Raises:
            KeyError: If ``b64_image`` or ``keys`` is missing, or ``scene``
                is not a key of ``self.scene_mapping``.
        """
        scene = inp.get('scene', 'form')
        b64_image = inp['b64_image']
        ellm_schema = inp['keys']

        # Deep-copy so per-call additions never leak into the shared defaults.
        params = copy.deepcopy(self.params)
        params.update(self.scene_mapping[scene])
        params['ellm_schema'] = ellm_schema

        req_data = {'data': [b64_image], 'param': params}

        try:
            r = self.client.post(url=self.ep,
                                 json=req_data,
                                 timeout=self.timeout)
            return r.json()
        except Exception as e:
            # Deliberate best-effort: report failures as an error payload
            # instead of raising to the caller.
            return {'status_code': 400, 'status_message': str(e)}
Loading

0 comments on commit 3c90a82

Please sign in to comment.