Skip to content

Commit

Permalink
Add compose file. (#612)
Browse files Browse the repository at this point in the history
  • Loading branch information
ZePan110 authored Sep 4, 2024
1 parent 1885a69 commit 7a21d09
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_comps-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
cd ${{ github.workspace }}
if [[ $(grep -c "llava-tgi:" ${docker_compose_yml}) != 0 ]]; then
git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4
git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 && cd ../
fi
if [[ $(grep -c "vllm-openvino:" ${docker_compose_yml}) != 0 ]]; then
git clone https://github.com/vllm-project/vllm.git vllm-openvino
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/docker/compose/llms-compose-cd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ services:
image: ${REGISTRY:-opea}/llm-native:${TAG:-latest}
vllm-openvino:
build:
dockerfile: vllm-openvino/Dockerfile.openvino
context: vllm-openvino
dockerfile: Dockerfile.openvino
image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest}
3 changes: 2 additions & 1 deletion .github/workflows/docker/compose/lvms-compose-cd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ services:
image: ${REGISTRY:-opea}/lvm-tgi:${TAG:-latest}
llava-tgi:
build:
dockerfile: tgi-gaudi/Dockerfile
context: tgi-gaudi
dockerfile: Dockerfile
image: ${REGISTRY:-opea}/llava-tgi:${TAG:-latest}
9 changes: 9 additions & 0 deletions .github/workflows/docker/compose/vectorstores-compose-cd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# This file should be run from the root of the repo.
# Compose build definition for the vectorstore-pathway image.
services:
vectorstore-pathway:
build:
# NOTE(review): this value does not end in a Dockerfile filename; it looks like
# a directory path. Confirm it resolves to an actual Dockerfile.
dockerfile: comps/vectorstores/langchain/pathway
# Registry and tag fall back to "opea" and "latest" when REGISTRY / TAG are unset.
image: ${REGISTRY:-opea}/vectorstore-pathway:${TAG:-latest}
49 changes: 46 additions & 3 deletions comps/dataprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@
from langchain_community.llms import HuggingFaceEndpoint
from PIL import Image

from comps import CustomLogger

# Module-level logger registered under the name "prepare_doc_util".
logger = CustomLogger("prepare_doc_util")
# Gate for the optional logger.info calls in this module.
# NOTE: os.getenv returns the raw string when LOGFLAG is set, so any non-empty
# value (including "false" or "0") is truthy; only an unset or empty LOGFLAG
# leaves logging disabled.
logflag = os.getenv("LOGFLAG", False)


# Module-specific exception type derived directly from Exception; it adds no
# attributes or behavior of its own.
# NOTE(review): this name shadows Python's built-in TimeoutError (an OSError
# subclass) inside this module, so `except TimeoutError` here will not catch
# the built-in one -- confirm that is intended.
class TimeoutError(Exception):
pass
Expand Down Expand Up @@ -428,14 +433,51 @@ def fetch(self, url, headers=None, max_times=5):
if not headers:
headers = self.headers
while max_times:
if not url.startswith("http") or not url.startswith("https"):
parsed_url = urlparse(url)
if not parsed_url.scheme:
url = "http://" + url
print("start fetch %s...", url)
if logflag:
logger.info("start fetch %s..." % url)
try:
response = requests.get(url, headers=headers, verify=True)
if response.status_code != 200:
print("fail to fetch %s, response status code: %s", url, response.status_code)
else:
# Extract charset from the Content-Type header
content_type = response.headers.get("Content-Type", "").lower()
if "charset=" in content_type:
# Extract charset value from the content-type header
charset = content_type.split("charset=")[-1].strip()
response.encoding = charset
if logflag:
logger.info(f"Charset detected and set: {response.encoding}")
else:
import re

# Extract charset from the response HTML content
charset_from_meta = None
# Check for <meta charset="...">
match = re.search(r'<meta\s+charset=["\']?([^"\'>]+)["\']?', response.text, re.IGNORECASE)
if match:
charset_from_meta = match.group(1)
# Check for <meta http-equiv="Content-Type" content="...; charset=...">
if not charset_from_meta:
match = re.search(
r'<meta\s+http-equiv=["\']?content-type["\']?\s+content=["\']?[^"\']*charset=([^"\'>]+)["\']?',
response.text,
re.IGNORECASE,
)
if match:
charset_from_meta = match.group(1)
if charset_from_meta:
response.encoding = charset_from_meta
if logflag:
logger.info(f"Charset detected and set from meta tag: {response.encoding}")
else:
# Fallback to default encoding
response.encoding = "utf-8"
if logflag:
logger.info("Charset not specified, using default utf-8")
return response
except Exception as e:
print("fail to fetch %s, caused by %s", url, e)
Expand Down Expand Up @@ -540,8 +582,9 @@ def load_html_data(url):
main_content = all_text if main_content == "" else main_content
main_content = main_content.replace("\n", "")
main_content = main_content.replace("\n\n", "")
main_content = uni_pro(main_content)
main_content = re.sub(r"\s+", " ", main_content)
if logflag:
logger.info("main_content=[%s]" % main_content)

return main_content

Expand Down

0 comments on commit 7a21d09

Please sign in to comment.