update (#77)

配置修改，代码更新
dataelement · Oct 16, 2023 · 3c90a82 · 3c90a82
2 parents ebeed02 + b2cc2db
commit 3c90a82
Show file tree

Hide file tree

Showing 22 changed files with 443 additions and 33 deletions.
diff --git a/docker/bisheng/config/config.yaml b/docker/bisheng/config/config.yaml
@@ -90,14 +90,14 @@ chains:
   CombineDocsChain:
     documentation: ""
 documentloaders:
+  ElemUnstructuredLoaderV0:
+    documentation: ""
   AirbyteJSONLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/airbyte_json"
   CoNLLULoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/conll-u"
   CSVLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/csv"
-  UnstructuredEmailLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/email"
   EverNoteLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/evernote"
   FacebookChatLoader:
@@ -106,24 +106,16 @@ documentloaders:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/gutenberg"
   BSHTMLLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
-  UnstructuredHTMLLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
-  UnstructuredMarkdownLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/markdown"
   PyPDFDirectoryLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
   PyPDFLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
-  UnstructuredPowerPointLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_powerpoint"
   SRTLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/subtitle"
   TelegramChatLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/telegram"
   TextLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/"
-  UnstructuredWordDocumentLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_word"
   WebBaseLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_base"
   AZLyricsLoader:
@@ -334,6 +326,8 @@ utilities:
   WolframAlphaAPIWrapper:
     documentation: ""
 retrievers:
+  MixEsVectorRetriever:
+    documentation: ""
   MultiQueryRetriever:
     documentation: "https://python.langchain.com/docs/modules/data_connection/retrievers/how_to/MultiQueryRetriever"
   # https://github.com/supabase-community/supabase-py/issues/482

diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -12,7 +12,7 @@ services:
     command: redis-server /etc/redis.conf
 
   mysql:
-    image: mysql:5.7.40
+    image: mysql:8.0
     environment:
       - "MYSQL_ROOT_PASSWORD=1234"  # 数据库密码，建议修改，如果修改需要同步修改bisheng/config/config.yaml配置
       - "MYSQL_DATABASE=bisheng"

diff --git a/docker/mysql/conf/my.cnf b/docker/mysql/conf/my.cnf
@@ -5,9 +5,8 @@ default-character-set=utf8mb4
 default-character-set=utf8mb4
 
 [mysqld]
-init_connect='SET collation_connection = utf8mb4_unicode_ci'
-init_connect='SET NAMES utf8mb4'
+# SET NAMES without a COLLATE clause resets collation_connection to the
+# character set's default collation, so the collation is requested in the
+# same SET NAMES statement instead of through a separate assignment.
+init_connect='SET NAMES utf8mb4 COLLATE utf8mb4_unicode_ci'
 character-set-server=utf8mb4
 collation-server=utf8mb4_unicode_ci
-skip-character-set-client-handshake
-sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
+# skip-character-set-client-handshake
+sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION
diff --git a/src/backend/bisheng/chat/manager.py b/src/backend/bisheng/chat/manager.py
@@ -485,7 +485,7 @@ async def process_source_document(self, source_document: List[Document], chat_id
 
         from bisheng.settings import settings
         # 使用大模型进行关键词抽取，模型配置临时方案
-        keyword_conf = settings.default_llm.get('keyword_llm')
+        keyword_conf = settings.default_llm
         host_base_url = keyword_conf.get('host_base_url')
         model = keyword_conf.get('model')
 

diff --git a/src/backend/bisheng/config.yaml b/src/backend/bisheng/config.yaml
@@ -92,8 +92,6 @@ documentloaders:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/conll-u"
   CSVLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/csv"
-  UnstructuredEmailLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/email"
   EverNoteLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/evernote"
   FacebookChatLoader:
@@ -102,24 +100,16 @@ documentloaders:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/gutenberg"
   BSHTMLLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
-  UnstructuredHTMLLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/html"
-  UnstructuredMarkdownLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/markdown"
   PyPDFDirectoryLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
   PyPDFLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf"
-  UnstructuredPowerPointLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_powerpoint"
   SRTLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/subtitle"
   TelegramChatLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/telegram"
   TextLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/"
-  UnstructuredWordDocumentLoader:
-    documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/microsoft_word"
   WebBaseLoader:
     documentation: "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_base"
   AZLyricsLoader:

diff --git a/src/backend/bisheng/main.py b/src/backend/bisheng/main.py
@@ -1,3 +1,4 @@
+import time
 from pathlib import Path
 from typing import Optional
 
@@ -87,6 +88,7 @@ def setup_app(static_files_dir: Optional[Path] = None) -> FastAPI:
 
 
 configure(log_level='DEBUG', log_file='./data/bisheng.log')
+time.sleep(20)
 app = create_app()
 
 if __name__ == '__main__':

diff --git a/src/bisheng-langchain/README.md b/src/bisheng-langchain/README.md
@@ -1,7 +1,7 @@
 ## What is bisheng-langchain?
 
 bisheng-langchain is an open-source langchain extending library built to power building LLM application.
-bisheng-langchain provides more components to support Chinese LLMs and and Chinese based token enviroments for prompt engineering and ICL template.
+bisheng-langchain provides more components to support Chinese LLMs and Chinese-based token environments for prompt engineering and ICL templates.
 
 
 The project is a sub-module of [bisheng](https://github.com/dataelement/bisheng).

diff --git a/src/bisheng-langchain/bisheng_langchain/chains/__init__.py b/src/bisheng-langchain/bisheng_langchain/chains/__init__.py
@@ -1,5 +1,6 @@
 from bisheng_langchain.chains.combine_documents.stuff import StuffDocumentsChain
 
-__all__ = [
-    'StuffDocumentsChain',
-]
+from .loader_output import LoaderOutputChain
+
+__all__ = ['StuffDocumentsChain', 'LoaderOutputChain']
+
diff --git a/src/bisheng-langchain/bisheng_langchain/chains/loader_output.py b/src/bisheng-langchain/bisheng_langchain/chains/loader_output.py
@@ -0,0 +1,68 @@
+"""Chain that returns the page content of a fixed list of documents."""
+import functools
+import logging
+import json
+from typing import Any, Awaitable, Callable, Dict, List, Optional
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.chains.base import Chain
+from langchain.docstore.document import Document
+
+logger = logging.getLogger(__name__)
+
+
+class LoaderOutputChain(Chain):
+    """Chain that returns the text of the documents it was built with.
+
+    The values passed in ``inputs`` are not read; only ``documents`` is used.
+    The two execution paths format the result differently: the synchronous
+    path joins the page contents with a blank line between them, while the
+    asynchronous path serialises the list of page contents as a JSON array.
+    """
+    # Documents whose ``page_content`` makes up the chain output.
+    documents: List[Document]
+    input_key: str = "begin"  #: :meta private:
+    output_key: str = "text"  #: :meta private:
+
+    @staticmethod
+    @functools.lru_cache
+    def _log_once(msg: str) -> None:
+        """Log ``msg`` as a warning; calls are memoised per message text.
+
+        :meta private:
+        """
+        logger.warning(msg)
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Single-element list holding ``input_key``.
+
+        :meta private:
+        """
+        return [self.input_key]
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Single-element list holding ``output_key``.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def _call(
+        self,
+        inputs: Dict[str, str],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, str]:
+        """Return the page contents joined by blank lines under ``output_key``."""
+        joined = '\n\n'.join(doc.page_content for doc in self.documents)
+        return {self.output_key: joined}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Return the page contents as a JSON array string under ``output_key``.
+
+        NOTE(review): this differs from ``_call``, which joins the contents as
+        plain text -- confirm which format callers expect.
+        """
+        serialised = json.dumps(
+            [doc.page_content for doc in self.documents],
+            indent=2,
+            ensure_ascii=False,
+        )
+        return {self.output_key: serialised}
diff --git a/src/bisheng-langchain/bisheng_langchain/chat_models/__init__.py b/src/bisheng-langchain/bisheng_langchain/chat_models/__init__.py
@@ -1,4 +1,4 @@
-from .host_llm import HostBaichuanChat, HostChatGLM2, HostLlama2Chat, HostQwenChat
+from .host_llm import CustomLLMChat, HostBaichuanChat, HostChatGLM2, HostLlama2Chat, HostQwenChat
 from .minimax import ChatMinimaxAI
 from .proxy_llm import ProxyChatLLM
 from .wenxin import ChatWenxin
@@ -8,5 +8,6 @@
 __all__ = [
     'ProxyChatLLM', 'ChatMinimaxAI', 'ChatWenxin', 'ChatZhipuAI',
     'ChatXunfeiAI',
-    'HostChatGLM2', 'HostBaichuanChat', 'HostLlama2Chat', 'HostQwenChat'
+    'HostChatGLM2', 'HostBaichuanChat', 'HostLlama2Chat', 'HostQwenChat',
+    'CustomLLMChat'
 ]
diff --git a/src/bisheng-langchain/bisheng_langchain/chat_models/host_llm.py b/src/bisheng-langchain/bisheng_langchain/chat_models/host_llm.py
@@ -409,3 +409,44 @@ class HostLlama2Chat(BaseHostChatLLM):
     def _llm_type(self) -> str:
         """Return type of chat model."""
         return 'llama2_chat'
+
+
+class CustomLLMChat(BaseHostChatLLM):
+    # Chat model for a custom LLM chat API; the API is expected to be
+    # compatible with the OpenAI chat definition.
+    model_name: str = Field('custom-llm-chat', alias='model')
+
+    temperature: float = 0.1
+    top_p: float = 0.1
+    max_tokens: int = 8192
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of chat model."""
+        return 'custom_llm_chat'
+
+    def completion_with_retry(self, **kwargs: Any) -> Any:
+        """Send one chat request to ``host_base_url`` and return the decoded JSON.
+
+        The request is wrapped with the decorator returned by
+        ``_create_retry_decorator``. Only ``messages``, ``top_p``,
+        ``temperature``, ``max_tokens`` and ``do_sample`` are read from
+        ``kwargs``; any that are absent are sent as ``None``.
+        """
+        @_create_retry_decorator(self)
+        def _completion_with_retry(**kwargs: Any) -> Any:
+            # Build the payload in a fixed key order: messages, model, then
+            # the sampling options taken straight from the keyword arguments.
+            params = {
+                'messages': kwargs.get('messages'),
+                'model': self.model_name,
+            }
+            for option in ('top_p', 'temperature', 'max_tokens', 'do_sample'):
+                params[option] = kwargs.get(option)
+
+            if self.verbose:
+                print('payload', params)
+
+            response = self.client(url=self.host_base_url, json=params)
+            return response.json()
+
+        return _completion_with_retry(**kwargs)
diff --git a/src/bisheng-langchain/bisheng_langchain/document_loaders/__init__.py b/src/bisheng-langchain/bisheng_langchain/document_loaders/__init__.py
@@ -1,4 +1,6 @@
 from .elem_pdf import PDFWithSemanticLoader
+
 from .elem_unstrcutured_loader import ElemUnstructuredLoader, ElemUnstructuredLoaderV0
 
-__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0']
+# 'UniversalKVLoader' is not imported by this module, so listing it here would
+# make `from bisheng_langchain.document_loaders import *` raise AttributeError.
+# Re-add it to __all__ together with its import.
+__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0']
+
diff --git a/src/bisheng-langchain/bisheng_langchain/document_loaders/parsers/__init__.py b/src/bisheng-langchain/bisheng_langchain/document_loaders/parsers/__init__.py
@@ -1,5 +1,9 @@
 from .image import LayoutParser
+from .ocr_client import OCRClient
+from .ellm_client import ELLMClient
 
 __all__ = [
     'LayoutParser',
+    'OCRClient',
+    'ELLMClient'
 ]
diff --git a/src/bisheng-langchain/bisheng_langchain/document_loaders/parsers/ellm_client.py b/src/bisheng-langchain/bisheng_langchain/document_loaders/parsers/ellm_client.py
@@ -0,0 +1,56 @@
+# import base64
+import copy
+import base64
+import requests
+from typing import Any, Iterator, List, Mapping, Optional, Union
+
+
+class ELLMClient(object):
+    """HTTP client that posts a base64 image plus an ELLM schema to an endpoint.
+
+    Each request combines the shared default parameters in ``self.params``
+    with the per-scene entries of ``self.scene_mapping`` (keys ``det``,
+    ``recog`` and ``ellm`` -- presumably detection / recognition / ELLM model
+    names; confirm against the service).
+    """
+
+    def __init__(self,
+                 api_base_url: Optional[str] = None,
+                 timeout: float = 10000):
+        """Create the client.
+
+        Args:
+            api_base_url: URL the prediction request is posted to.
+            timeout: Value forwarded as the ``timeout`` of every request
+                (``requests`` interprets it in seconds). Defaults to the
+                previously hard-coded 10000.
+        """
+        self.ep = api_base_url
+        self.client = requests.Session()
+        self.timeout = timeout
+        # Defaults sent with every request; copied per call in ``predict``.
+        self.params = {
+            'sort_filter_boxes': True,
+            'enable_huarong_box_adjust': True,
+            'support_long_image_segment': True,
+            'checkbox': ['std_checkbox'],
+            'rotateupright': True
+        }
+
+        # Scene name -> extra request parameters merged over ``self.params``.
+        self.scene_mapping = {
+            'doc': {
+                'det': 'general_text_det_mrcnn_v1.0',
+                'recog': 'transformer-v2.8-gamma-faster',
+                'ellm': 'ELLM'
+            },
+            'form': {
+                'det': 'mrcnn-v5.1',
+                'recog': 'transformer-v2.8-gamma-faster',
+                'ellm': 'ELLM'
+            },
+            'hand': {
+                'det': 'mrcnn-v5.1',
+                'recog': 'transformer-hand-v1.16-faster',
+                'ellm': 'ELLM'
+            }
+        }
+
+    def predict(self, inp):
+        """Post one prediction request and return the decoded JSON response.
+
+        Args:
+            inp: Mapping with ``b64_image`` (the image payload), ``keys``
+                (sent as ``ellm_schema``) and optionally ``scene`` (one of
+                the ``scene_mapping`` keys, default ``'form'``). The mapping
+                is only read, never modified.
+
+        Returns:
+            The JSON body of the response, or
+            ``{'status_code': 400, 'status_message': <error text>}`` when the
+            request or the JSON decoding raises.
+
+        Raises:
+            KeyError: If ``b64_image`` or ``keys`` is missing, or ``scene`` is
+                not a known scene.
+        """
+        # Read without popping so the caller's dict is left untouched.
+        scene = inp.get('scene', 'form')
+        b64_image = inp['b64_image']
+        ellm_schema = inp['keys']
+
+        # Deep copy so per-request additions never leak into the defaults.
+        params = copy.deepcopy(self.params)
+        params.update(self.scene_mapping[scene])
+        params['ellm_schema'] = ellm_schema
+
+        req_data = {'data': [b64_image], 'param': params}
+
+        try:
+            r = self.client.post(url=self.ep,
+                                 json=req_data,
+                                 timeout=self.timeout)
+            return r.json()
+        except Exception as e:
+            # Best-effort: report failures as an error payload, not a raise.
+            return {'status_code': 400, 'status_message': str(e)}