diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 32157c16..3384e0b6 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -163,7 +163,7 @@ RUN microdnf install -y \
 ARG PYTHON_VERSION

 # 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.3
+ARG VLLM_WHEEL_VERSION=0.5.0.post1

 RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
     && unzip vllm.whl \
@@ -277,11 +277,16 @@ ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip3 install \
         # additional dependencies for the TGIS gRPC server
-        grpcio-tools==1.63.0 \
+        grpcio-tools \
         # additional dependencies for openai api_server
         accelerate==0.30.0 \
         # hf_transfer for faster HF hub downloads
-        hf_transfer==0.1.6
+        hf_transfer==0.1.6 \
+        # additional dependencies for OpenTelemetry tracing
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai

 # Triton needs a CC compiler
 RUN microdnf install -y gcc \
diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md
index 14494422..15701c8c 100644
--- a/examples/production_monitoring/Otel.md
+++ b/examples/production_monitoring/Otel.md
@@ -32,14 +32,20 @@
    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
    ```
-   Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
+   Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger, and run vLLM with the OpenAI endpoint:
    ```
    export OTEL_SERVICE_NAME="vllm-server"
    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
    python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
    ```
+   or run vLLM with the gRPC endpoint:
+   ```
+   export OTEL_SERVICE_NAME="vllm-server"
+   export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+   python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" --grpc-port 50051
+   ```

-1. In a new shell, send requests with trace context from a dummy client
+1. In a new shell, send requests with trace context from a dummy HTTP client
    ```
    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
@@ -47,6 +53,15 @@
    export OTEL_SERVICE_NAME="client-service"
    python dummy_client.py
    ```
+   or a dummy gRPC client:
+   ```
+   export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
+   export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+   export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+   export OTEL_SERVICE_NAME="client-service"
+   python dummy_client_grpc.py
+   ```
+
 1. Open Jaeger webui: http://localhost:16686/
diff --git a/examples/production_monitoring/dummy_client_grpc.py b/examples/production_monitoring/dummy_client_grpc.py
new file mode 100644
index 00000000..9e0f99a6
--- /dev/null
+++ b/examples/production_monitoring/dummy_client_grpc.py
@@ -0,0 +1,41 @@
+import grpc
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+    OTLPSpanExporter)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
+                                            ConsoleSpanExporter)
+from opentelemetry.trace import SpanKind, set_tracer_provider
+from opentelemetry.trace.propagation.tracecontext import (
+    TraceContextTextMapPropagator)
+
+from vllm.entrypoints.grpc.pb import generation_pb2, generation_pb2_grpc
+
+trace_provider = TracerProvider()
+set_tracer_provider(trace_provider)
+
+trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+tracer = trace_provider.get_tracer("dummy-client")
+
+with grpc.insecure_channel("localhost:50051") as channel:
+    stub = generation_pb2_grpc.GenerationServiceStub(channel)
+
+    with tracer.start_as_current_span("client-span",
+                                      kind=SpanKind.CLIENT) as span:
+        prompt = "San Francisco is a"
+        span.set_attribute("prompt", prompt)
+
+        # Inject the current trace context into the gRPC metadata
+        headers = {}
+        TraceContextTextMapPropagator().inject(headers)
+        metadata = list(headers.items())
+
+        reqs = [generation_pb2.GenerationRequest(text=prompt)]
+
+        req = generation_pb2.BatchedGenerationRequest(
+            model_id="facebook/opt-125m",
+            requests=reqs,
+            params=generation_pb2.Parameters(
+                sampling=generation_pb2.SamplingParameters(temperature=0.0),
+                stopping=generation_pb2.StoppingCriteria(max_new_tokens=10)))
+        response = stub.Generate(req, metadata=metadata)
diff --git a/vllm/entrypoints/grpc/grpc_server.py b/vllm/entrypoints/grpc/grpc_server.py
index 15fc1dcf..0d84bc3e 100644
--- a/vllm/entrypoints/grpc/grpc_server.py
+++ b/vllm/entrypoints/grpc/grpc_server.py
@@ -43,6 +43,8 @@
                                      TypicalLogitsWarperWrapper)
 from vllm.tgis_utils.metrics import (FailureReasonLabel, ServiceMetrics,
                                      TGISStatLogger)
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

 logger = init_logger(__name__)
@@ -168,12 +170,20 @@ async def Generate(self, request: BatchedGenerationRequest,
                 prompt=req.text,
                 prompt_token_ids=input_ids
             )
+            is_tracing_enabled = await self.engine.is_tracing_enabled()
+            headers = dict(context.invocation_metadata())
+            trace_headers = None
+            if is_tracing_enabled:
+                trace_headers = extract_trace_headers(headers)
+            if not is_tracing_enabled and contains_trace_headers(headers):
+                log_tracing_disabled_warning()
             generators.append(
                 # prompt is supplied for observability, the text is not
                 # re-tokenized when `prompt_token_ids` is supplied
                 self.engine.generate(inputs=inputs,
                                      sampling_params=sampling_params,
                                      request_id=f"{request_id}-{i}",
+                                     trace_headers=trace_headers,
                                      **adapter_kwargs),
             )
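The client/server pair above relies on W3C Trace Context propagation: `dummy_client_grpc.py` injects the `traceparent` (and optional `tracestate`) keys into the gRPC metadata with `TraceContextTextMapPropagator().inject(...)`, and the `Generate` handler pulls them back out of `context.invocation_metadata()` via `vllm.tracing.extract_trace_headers` before passing them to `engine.generate(...)`. Below is a minimal standalone sketch of that server-side step, assuming only the OpenTelemetry API; the helper names are illustrative, not the actual `vllm.tracing` implementation.

```python
# Sketch only: recovering a parent span context from W3C trace-context
# keys carried in gRPC metadata, using OpenTelemetry's standard propagator.
from opentelemetry import trace
from opentelemetry.trace.propagation.tracecontext import (
    TraceContextTextMapPropagator)

# W3C Trace Context keys; gRPC metadata keys arrive lower-cased.
TRACE_HEADERS = ("traceparent", "tracestate")


def parent_context_from_metadata(metadata: dict):
    # Keep only the trace-context keys (the filtering role that
    # extract_trace_headers plays in the diff above), then build
    # the parent context from them.
    carrier = {k: v for k, v in metadata.items() if k in TRACE_HEADERS}
    return TraceContextTextMapPropagator().extract(carrier=carrier)


def handle_request(metadata: dict):
    # Any span started with this context becomes a child of the
    # "client-span" created in dummy_client_grpc.py.
    tracer = trace.get_tracer("sketch-server")
    with tracer.start_as_current_span(
            "server-span",
            context=parent_context_from_metadata(metadata),
            kind=trace.SpanKind.SERVER):
        pass  # serve the request here
```

Because both sides then share one trace ID, the client span from `dummy_client_grpc.py` and the engine's server-side spans appear in Jaeger as a single end-to-end trace, which is what the `trace_headers=trace_headers` argument added to `engine.generate(...)` enables.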