[Feature] [Optimum] [Intel] [OpenVINO] Add OpenVINO backend support through Optimum-Intel #454
@@ -16,6 +16,7 @@ cpu:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

Review comment (logic): Installing optimum[openvino] after requirements_install_from_poetry.sh may override dependency versions. Consider integrating this into the requirements script.

amd:
# 2 . command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
@@ -40,12 +40,14 @@ COPY poetry.lock poetry.toml pyproject.toml README.md /app/
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

Review comment (logic): Installing optimum[openvino] after the Poetry-based install could override Poetry-managed dependencies. Consider adding optimum[openvino] to pyproject.toml instead.

COPY infinity_emb infinity_emb
# Install dependency with infinity_emb package
# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu"
RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

Review comment (logic): Redundant installation of optimum[openvino]; this package was already installed in the previous step.

#

@@ -55,6 +57,7 @@ FROM builder as testing
# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu"
RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

Review comment (logic): Third redundant installation of optimum[openvino]; consider consolidating into a single installation.
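A hedged sketch of one way to consolidate (not part of this PR): install optimum[openvino] once in the builder stage instead of repeating the RUN line per stage; any stage built FROM builder then inherits the package. Installing it literally in the base stage would not work as written, since base has no Poetry environment.

# sketch only: single install point in the builder stage
FROM base as builder
# ... Poetry setup and first dependency install as above ...
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

FROM builder as testing
# inherits optimum[openvino] from builder; no extra RUN line needed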
# lint
RUN poetry run ruff check .
@@ -0,0 +1,132 @@
# Autogenerated warning:
# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd

FROM ubuntu:22.04 AS base

ENV PYTHONUNBUFFERED=1 \
    \
    # pip
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    \
    # make poetry create the virtual environment in the project's root
    # it gets named `.venv`
    POETRY_VIRTUALENVS_CREATE="true" \
    POETRY_VIRTUALENVS_IN_PROJECT="true" \
    # do not ask any interactive question
    POETRY_NO_INTERACTION=1 \
    EXTRAS="all" \
    PYTHON="python3.11"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
WORKDIR /app

FROM base as builder
# Set the working directory for the app
# Define the version of Poetry to install (default is 1.7.1)
# Define the directory to install Poetry to (default is /opt/poetry)
ARG POETRY_VERSION=1.8.4
ARG POETRY_HOME=/opt/poetry
# Create a Python virtual environment for Poetry and install it
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON -
ENV PATH=$POETRY_HOME/bin:$PATH
# Test if Poetry is installed in the expected path
RUN echo "Poetry version:" && poetry --version
# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes)
COPY poetry.lock poetry.toml pyproject.toml README.md /app/
# Install dependencies only
#
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"

RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

COPY infinity_emb infinity_emb
# Install dependency with infinity_emb package
# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu"

#


FROM builder as testing
# install lint and test dependencies
# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu"

# # lint

Review comment (logic): All testing stages are commented out. Tests should be enabled to ensure the OpenVINO backend works correctly (a hedged sketch follows the commented-out block below).
# # RUN poetry run ruff check .
# # RUN poetry run mypy .
# # pytest
# COPY tests tests
# # run end to end tests because of duration of build in github ci.
# # Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu
# # poetry run python -m pytest tests/end_to_end -x # TODO: does not work.
# RUN if [ -z "$TARGETPLATFORM" ]; then \
# ARCH=$(uname -m); \
# if [ "$ARCH" = "x86_64" ]; then \
# TARGETPLATFORM="linux/amd64"; \
# elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
# TARGETPLATFORM="linux/arm64"; \
# else \
# echo "Unsupported architecture: $ARCH"; exit 1; \
# fi; \
# fi; \
# echo "Running tests on TARGETPLATFORM=$TARGETPLATFORM"; \
# if [ "$TARGETPLATFORM" = "linux/arm64" ] ; then \
# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \
# else \
# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py tests/end_to_end/test_sentence_transformers.py -m "not performance" -x ; \
# fi
# RUN echo "all tests passed" > "test_results.txt"
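A hedged sketch of re-enabling the cheaper checks in the testing stage (not part of this PR), mirroring the active lint step in the other Dockerfile and the commented-out lines above:

# sketch only: minimal checks for the testing stage
RUN poetry run ruff check .
RUN poetry run mypy .
COPY tests tests
RUN poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x
RUN echo "all tests passed" > "test_results.txt"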

# # Use a multi-stage build -> production version, with download
# FROM base AS tested-builder
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/huggingface
# ENV PATH=/app/.venv/bin:$PATH
# # do nothing
# RUN echo "copied all files"

# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

# # Use a multi-stage build -> production version, with download
# # docker buildx build --target=production-with-download \
# # --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small .
# FROM tested-builder AS production-with-download
# # collect model name and engine from build args
# ARG MODEL_NAME
# RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi
# ARG ENGINE
# RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi
# # will exit with 3 if model is downloaded # TODO: better exit code
# RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? -eq 3 ]

Review comment (style): Exit code 3 handling needs to be properly documented and implemented.

# ENTRYPOINT ["infinity_emb"]

# # Use a multi-stage build -> production version
# FROM tested-builder AS production
# ENTRYPOINT ["infinity_emb"]

Review comment (logic): Missing active production stage definition; the current production stage is commented out.
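A hedged sketch of minimal re-enabled production stages, taken directly from the commented-out tested-builder and production stages above (not part of this PR; it assumes the testing stage writes test_results.txt again):

FROM base AS tested-builder
COPY --from=builder /app /app
# force the testing stage to run before a production image can build
COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH

FROM tested-builder AS production
ENTRYPOINT ["infinity_emb"]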
@@ -6,15 +6,19 @@

import numpy as np

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TRANSFORMERS
from infinity_emb._optional_imports import (
    CHECK_ONNXRUNTIME,
    CHECK_TRANSFORMERS,
    CHECK_OPTIMUM_INTEL,
)
from infinity_emb.args import EngineArgs
from infinity_emb.primitives import EmbeddingReturnType, PoolingMethod
from infinity_emb.transformer.abstract import BaseEmbedder
from infinity_emb.transformer.quantization.interface import quant_embedding_decorator
from infinity_emb.transformer.utils_optimum import (
    cls_token_pooling,
    device_to_onnx,
    get_onnx_files,
    # get_onnx_files,
    mean_pooling,
    normalize,
    optimize_model,

@@ -25,43 +29,80 @@
        from optimum.onnxruntime import (  # type: ignore[import-untyped]
            ORTModelForFeatureExtraction,
        )
        from infinity_emb.transformer.utils_optimum import get_onnx_files

    except (ImportError, RuntimeError, Exception) as ex:
        CHECK_ONNXRUNTIME.mark_dirty(ex)


if CHECK_OPTIMUM_INTEL.is_available:
    try:
        from optimum.intel import OVModelForFeatureExtraction  # type: ignore[import-untyped]
        from infinity_emb.transformer.utils_optimum import get_openvino_files

    except (ImportError, RuntimeError, Exception) as ex:
        CHECK_OPTIMUM_INTEL.mark_dirty(ex)


if CHECK_TRANSFORMERS.is_available:
    from transformers import AutoConfig, AutoTokenizer  # type: ignore[import-untyped]


class OptimumEmbedder(BaseEmbedder):
    def __init__(self, *, engine_args: EngineArgs):
        CHECK_ONNXRUNTIME.mark_required()
        provider = device_to_onnx(engine_args.device)
        self.provider = provider

        if provider == "OpenVINOExecutionProvider":
            CHECK_OPTIMUM_INTEL.mark_required()
            filename = ""

Review comment (logic): An empty filename could cause issues if get_openvino_files fails. Consider setting a default model path or handling this case explicitly.

            try:
                openvino_file = get_openvino_files(
                    model_name_or_path=engine_args.model_name_or_path,
                    revision=engine_args.revision,
                    use_auth_token=True,
                )
                filename = openvino_file.as_posix()
            except Exception as e:  # show error then let the optimum intel compress on the fly
                print(str(e))

Review comment (logic): Printing the error to stdout could mask critical failures. Consider proper error logging or propagating the exception.
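One possible tightening of this fallback, as a hedged sketch rather than the PR's implementation: log the failure through the standard logging module instead of print, and keep the on-the-fly export path explicit. get_openvino_files and engine_args are the names used in this PR; the module-level logger is an assumption.

import logging

logger = logging.getLogger(__name__)  # assumption: not defined in this PR

# inside OptimumEmbedder.__init__, replacing the try/except above:
filename = ""  # per the PR's comment, an empty file_name lets optimum-intel compress on the fly
try:
    openvino_file = get_openvino_files(
        model_name_or_path=engine_args.model_name_or_path,
        revision=engine_args.revision,
        use_auth_token=True,
    )
    filename = openvino_file.as_posix()
except Exception as e:
    # keep the fallback, but record the failure where operators will see it
    logger.warning(
        "No precompiled OpenVINO file found for %s (%s); optimum-intel will export on the fly.",
        engine_args.model_name_or_path,
        e,
    )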
            self.model = optimize_model(
                model_name_or_path=engine_args.model_name_or_path,
                revision=engine_args.revision,
                trust_remote_code=engine_args.trust_remote_code,
                execution_provider=provider,
                file_name=filename,
                optimize_model=not os.environ.get(
                    "INFINITY_ONNX_DISABLE_OPTIMIZE", False
                ),  # TODO: make this env variable public
                model_class=OVModelForFeatureExtraction,
            )

        onnx_file = get_onnx_files(
            model_name_or_path=engine_args.model_name_or_path,
            revision=engine_args.revision,
            use_auth_token=True,
            prefer_quantized="cpu" in provider.lower(),
        )
        elif provider == "CPUExecutionProvider":

Review comment (logic): A missing else clause for unsupported providers could lead to an undefined model state.
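A hedged sketch of the guard the reviewer asks for (not part of the PR); whether additional providers such as CUDA need their own branch instead is a separate design decision:

        else:
            # no branch above matched: fail fast instead of leaving self.model undefined
            raise ValueError(
                f"Unsupported execution provider for the optimum backend: {provider}"
            )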
            CHECK_ONNXRUNTIME.mark_required()
            onnx_file = get_onnx_files(
                model_name_or_path=engine_args.model_name_or_path,
                revision=engine_args.revision,
                use_auth_token=True,
                prefer_quantized="cpu" in provider.lower(),
            )
            self.model = optimize_model(
                model_name_or_path=engine_args.model_name_or_path,
                revision=engine_args.revision,
                trust_remote_code=engine_args.trust_remote_code,
                execution_provider=provider,
                file_name=onnx_file.as_posix(),
                optimize_model=not os.environ.get(
                    "INFINITY_ONNX_DISABLE_OPTIMIZE", False
                ),  # TODO: make this env variable public
                model_class=ORTModelForFeatureExtraction,
            )
            self.model.use_io_binding = False

        self.pooling = (
            mean_pooling if engine_args.pooling_method == PoolingMethod.mean else cls_token_pooling
        )

        self.model = optimize_model(
            model_name_or_path=engine_args.model_name_or_path,
            revision=engine_args.revision,
            trust_remote_code=engine_args.trust_remote_code,
            execution_provider=provider,
            file_name=onnx_file.as_posix(),
            optimize_model=not os.environ.get(
                "INFINITY_ONNX_DISABLE_OPTIMIZE", False
            ),  # TODO: make this env variable public
            model_class=ORTModelForFeatureExtraction,
        )
        self.model.use_io_binding = False

        self.tokenizer = AutoTokenizer.from_pretrained(
            engine_args.model_name_or_path,
            revision=engine_args.revision,
Review comment (style): Consider pinning the optimum[openvino] version to ensure reproducible builds.
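For illustration only, a pinned variant of the install line; the version shown is a placeholder to be chosen by the maintainers, not a tested recommendation:

# sketch only: pin optimum[openvino] to an exact release
RUN poetry run python -m pip install "optimum[openvino]==1.23.3"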