Skip to content

Commit

Permalink
add unstructloader
Browse files Browse the repository at this point in the history
增加溯源Node
  • Loading branch information
yaojin3616 committed Oct 15, 2023
2 parents b92a804 + b307ac4 commit 7f57de9
Show file tree
Hide file tree
Showing 20 changed files with 616 additions and 58 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ jobs:
twine upload dist/* -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASSWORD }} --repository pypi
# 构建 backend 并推送到 Docker hub
- name: install poetry
uses: snok/install-poetry@v1
with:
installer-parallel: true
- name: build lock
run: |
cd ./src/backend
poetry lock
cd ../../
- name: Build backend and push
id: docker_build_backend
uses: docker/build-push-action@v2
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -257,4 +257,7 @@ dmypy.json
# docusaurus
.docusaurus/

sftp-config.json

/tmp/*
sftp-config.json
6 changes: 3 additions & 3 deletions docker/bisheng/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ admin:
password: "1234"

# bisheng-rt服务地址
bisheng-rt:
name: "RT-Server"
server: "192.168.0.1:9001"
# bisheng_rt:
# name: "RT-Server"
# server: "192.168.0.1:9001"

# 为知识库的embedding进行模型配置
knowledges:
Expand Down
4 changes: 2 additions & 2 deletions src/backend/bisheng/api/v1/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ def get_chatlist_list(*, session: Session = Depends(get_session), Authorize: Aut
Authorize.jwt_required()
payload = json.loads(Authorize.get_jwt_subject())

smt = (select(ChatMessage.flow_id, ChatMessage.chat_id, ChatMessage.chat_id,
smt = (select(ChatMessage.flow_id, ChatMessage.chat_id,
func.max(ChatMessage.create_time).label('create_time'),
func.max(ChatMessage.update_time).label('update_time')).where(
ChatMessage.user_id == payload.get('user_id')).group_by(
ChatMessage.flow_id).order_by(func.max(ChatMessage.create_time).desc()))
ChatMessage.flow_id, ChatMessage.chat_id).order_by(func.max(ChatMessage.create_time).desc()))
db_message = session.exec(smt).all()
flow_ids = [message.flow_id for message in db_message]
db_flow = session.exec(select(Flow).where(Flow.id.in_(flow_ids))).all()
Expand Down
6 changes: 5 additions & 1 deletion src/backend/bisheng/api/v1/skillcenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ def create_template(*, session: Session = Depends(get_session), template: Templa

# 增加 order_num x,x+65535
max_order = session.exec(select(Template).order_by(Template.order_num.desc()).limit(1)).first()
db_template.order_num = max_order.order_num + ORDER_GAP
# 如果没有数据,就从 65535 开始
if max_order is None:
db_template.order_num = ORDER_GAP
else:
db_template.order_num = max_order.order_num + ORDER_GAP
session.add(db_template)
session.commit()
session.refresh(db_template)
Expand Down
2 changes: 1 addition & 1 deletion src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ include = ["./bisheng/*", "bisheng/**/*"]
bisheng = "bisheng.__main__:main"

[tool.poetry.dependencies]
bisheng_langchain = "^0.1.1"
bisheng_langchain = "^0.1.6"
fastapi_jwt_auth = "^0.5.0"
redis = "^5.0.0"
jieba = "^0.42.1"
Expand Down
2 changes: 0 additions & 2 deletions src/bisheng-langchain/bisheng_langchain/chains/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from bisheng_langchain.chains.combine_documents.stuff import StuffDocumentsChain
from bisheng_langchain.chains.retrieval_qa.base import MultiRetrievalQA

__all__ = [
'StuffDocumentsChain',
'MultiRetrievalQA',
]
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .host_llm import BaichuanChat, ChatGLM2Host, Llama2Chat, QwenChat
from .host_llm import HostBaichuanChat, HostChatGLM2, HostLlama2Chat, HostQwenChat
from .minimax import ChatMinimaxAI
from .proxy_llm import ProxyChatLLM
from .wenxin import ChatWenxin
Expand All @@ -7,5 +7,6 @@

__all__ = [
'ProxyChatLLM', 'ChatMinimaxAI', 'ChatWenxin', 'ChatZhipuAI',
'ChatXunfeiAI', 'Llama2Chat', 'ChatGLM2Host', 'BaichuanChat', 'QwenChat'
'ChatXunfeiAI',
'HostChatGLM2', 'HostBaichuanChat', 'HostLlama2Chat', 'HostQwenChat'
]
10 changes: 5 additions & 5 deletions src/bisheng-langchain/bisheng_langchain/chat_models/host_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
return num_tokens


class ChatGLM2Host(BaseHostChatLLM):
class HostChatGLM2(BaseHostChatLLM):
# chatglm2-12b, chatglm2-6b
model_name: str = Field('chatglm2-6b', alias='model')

Expand All @@ -367,7 +367,7 @@ def _llm_type(self) -> str:
return 'chatglm2'


class BaichuanChat(BaseHostChatLLM):
class HostBaichuanChat(BaseHostChatLLM):
# Baichuan-7B-Chat, Baichuan-13B-Chat
model_name: str = Field('Baichuan-13B-Chat', alias='model')

Expand All @@ -378,10 +378,10 @@ class BaichuanChat(BaseHostChatLLM):
@property
def _llm_type(self) -> str:
"""Return type of chat model."""
return 'baichang_chat'
return 'baichuan_chat'


class QwenChat(BaseHostChatLLM):
class HostQwenChat(BaseHostChatLLM):
# Qwen-7B-Chat
model_name: str = Field('Qwen-7B-Chat', alias='model')

Expand All @@ -395,7 +395,7 @@ def _llm_type(self) -> str:
return 'qwen_chat'


class Llama2Chat(BaseHostChatLLM):
class HostLlama2Chat(BaseHostChatLLM):
# Llama-2-7b-chat-hf, Llama-2-13b-chat-hf, Llama-2-70b-chat-hf
model_name: str = Field('Llama-2-7b-chat-hf', alias='model')

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .elem_pdf import PDFWithSemanticLoader
from .elem_unstrcutured_loader import ElemUnstructuredLoader, ElemUnstructuredLoaderV0

__all__ = ['PDFWithSemanticLoader']
__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0']
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# flake8: noqa
"""Loads PDF with semantic splilter."""
import base64
import io
import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from collections import Counter
from copy import deepcopy
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Union
from urllib.parse import urlparse

import fitz
import numpy as np
import pypdfium2
import requests
from bisheng_langchain.document_loaders.parsers import LayoutParser
from langchain.docstore.document import Document
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.pdf import BasePDFLoader
from shapely import Polygon
from shapely import box as Rect


def merge_partitions(partitions):
    """Join partition elements into one text and collect their metadata.

    Each element of ``partitions`` is read through ``element['type']``,
    ``element['text']`` and ``element['metadata']['extra_data']``; the latter
    must provide ``bboxes``, ``pages``, ``types`` and ``indexes``.

    Text layout rules:
      * first element: the text as-is, followed by a newline if it is a Title;
      * Title directly after a Title: one newline before, one after;
      * any other Title, and every Table: two newlines before, one after;
      * everything else: one newline before.

    Returns:
        A ``(content, metadata)`` tuple where ``content`` is the joined text
        and ``metadata`` is a dict with the keys ``bboxes`` (coordinates cast
        to int), ``pages``, ``indexes`` and ``types``.
    """
    pieces = []
    merged = {'bboxes': [], 'pages': [], 'indexes': [], 'types': []}
    offset = 0
    previous_type = ''

    for position, element in enumerate(partitions):
        kind = element['type']
        body = element['text']
        extra = element['metadata']['extra_data']

        # Decide how this element is glued to the text emitted so far.
        if position == 0:
            piece = body + '\n' if kind == 'Title' else body
        elif kind == 'Title':
            lead = '\n' if previous_type == 'Title' else '\n\n'
            piece = lead + body + '\n'
        elif kind == 'Table':
            piece = '\n\n' + body + '\n'
        else:
            piece = '\n' + body
        pieces.append(piece)
        previous_type = kind

        int_boxes = [[int(coord) for coord in bbox] for bbox in extra['bboxes']]
        merged['bboxes'].extend(int_boxes)
        merged['pages'].extend(extra['pages'])
        merged['types'].extend(extra['types'])

        # Spans are shifted by the combined length of the pieces emitted
        # before this one; the separator prepended to the current piece is
        # not added to the shift.
        # NOTE(review): confirm that downstream consumers expect this.
        shifted = [[begin + offset, end + offset]
                   for (begin, end) in extra['indexes']]
        merged['indexes'].extend(shifted)
        offset += len(piece)

    return ''.join(pieces), merged


class ElemUnstructuredLoader(BasePDFLoader):
    """Load a file by posting it to an unstructured-parsing HTTP endpoint.

    The file content is base64-encoded and sent in ``partition`` mode. The
    ``partitions`` list in the JSON reply is merged by ``merge_partitions``
    into a single ``Document`` whose metadata holds the merged ``bboxes``,
    ``pages``, ``indexes`` and ``types`` plus a ``source`` entry.
    """

    def __init__(self,
                 file_name: str,
                 file_path: str,
                 unstructured_api_key: Optional[str] = None,
                 unstructured_api_url: Optional[str] = None,
                 start: int = 0,
                 n: Optional[int] = None,
                 verbose: bool = False) -> None:
        """Store the request settings and hand ``file_path`` to the base loader.

        Args:
            file_name: Name reported to the service (its basename is sent) and
                recorded as the document ``source``.
            file_path: Path passed to ``BasePDFLoader.__init__``; ``load``
                later reads ``self.file_path``.
            unstructured_api_key: Stored on the instance; not used by the code
                in this class.
            unstructured_api_url: URL the request is posted to.
            start: Sent to the service as ``parameters['start']``.
            n: Sent to the service as ``parameters['n']``.
            verbose: Accepted for interface compatibility; not used here.
        """
        self.unstructured_api_url = unstructured_api_url
        self.unstructured_api_key = unstructured_api_key
        self.headers = {'Content-Type': 'application/json'}
        self.file_name = file_name
        self.start = start
        self.n = n
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Send the file to the service and return one merged ``Document``.

        Returns:
            A single-element list with the merged document.

        Raises:
            KeyError: If the JSON reply has no ``partitions`` entry.
        """
        # Read through a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(self.file_path, 'rb') as file_obj:
            b64_data = base64.b64encode(file_obj.read()).decode()

        payload = dict(
            filename=os.path.basename(self.file_name),
            b64_data=[b64_data],
            mode='partition',
            parameters={'start': self.start, 'n': self.n})

        # NOTE(review): no timeout is set, so a stalled service blocks the
        # caller indefinitely; consider adding one once a safe value is known.
        resp = requests.post(
            self.unstructured_api_url,
            headers=self.headers,
            json=payload).json()

        partitions = resp['partitions']
        content, metadata = merge_partitions(partitions)
        metadata['source'] = self.file_name

        doc = Document(page_content=content, metadata=metadata)
        return [doc]


class ElemUnstructuredLoaderV0(BasePDFLoader):
    """Load a file as plain text via an unstructured-parsing HTTP endpoint.

    The file content is base64-encoded and sent in ``text`` mode. The ``text``
    field of the JSON reply becomes the content of a single ``Document`` whose
    metadata only holds a ``source`` entry.
    """

    def __init__(self,
                 file_name: str,
                 file_path: str,
                 unstructured_api_key: Optional[str] = None,
                 unstructured_api_url: Optional[str] = None,
                 start: int = 0,
                 n: Optional[int] = None,
                 verbose: bool = False) -> None:
        """Store the request settings and hand ``file_path`` to the base loader.

        Args:
            file_name: Name reported to the service (its basename is sent) and
                recorded as the document ``source``.
            file_path: Path passed to ``BasePDFLoader.__init__``; ``load``
                later reads ``self.file_path``.
            unstructured_api_key: Stored on the instance; not used by the code
                in this class.
            unstructured_api_url: URL the request is posted to.
            start: Accepted for interface compatibility; not used here.
            n: Accepted for interface compatibility; not used here.
            verbose: Accepted for interface compatibility; not used here.
        """
        self.unstructured_api_url = unstructured_api_url
        self.unstructured_api_key = unstructured_api_key
        self.headers = {'Content-Type': 'application/json'}
        self.file_name = file_name
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Send the file to the service and return its text as one ``Document``.

        Returns:
            A single-element list with the resulting document.

        Raises:
            KeyError: If the JSON reply has no ``text`` entry.
        """
        # Read through a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(self.file_path, 'rb') as file_obj:
            b64_data = base64.b64encode(file_obj.read()).decode()

        payload = dict(
            filename=os.path.basename(self.file_name),
            b64_data=[b64_data],
            mode='text')

        # NOTE(review): no timeout is set, so a stalled service blocks the
        # caller indefinitely; consider adding one once a safe value is known.
        resp = requests.post(
            self.unstructured_api_url,
            headers=self.headers,
            json=payload).json()

        page_content = resp['text']
        meta = {'source': self.file_name}
        doc = Document(page_content=page_content, metadata=meta)
        return [doc]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from bisheng_langchain.retrievers.mix_es_vector import MixEsVectorRetriever

__all__ = [
"MixEsVectorRetriever"
]
Loading

0 comments on commit 7f57de9

Please sign in to comment.