From 07c98a5263967af6afd7eb58119c5c5504d9a9f2 Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Fri, 18 Oct 2024 09:13:23 +0200
Subject: [PATCH] Workaround for OOM during loading llama-405 (#396)

Repeating missing code

---
 vllm/model_executor/models/llama.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 18ce8d7f7d164..a64edc94825f3 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -3,6 +3,7 @@
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 Habana Labs, Ltd. an Intel Company
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -420,6 +421,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
+        if is_hpu:
+            torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale
     # factors (or else raise an exception). Thus, handled exceptions should
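
Note: below is a minimal standalone sketch of the pattern the patch relies on: after the
weight-copy loop, an explicit torch.hpu.synchronize() blocks until queued host-to-device
transfers finish, so staged weight data cannot pile up and exhaust memory while loading a
very large checkpoint such as Llama-405B. The helper name load_weights_with_sync and the
is_hpu detection shown here are illustrative assumptions, not part of the vLLM sources;
only torch.hpu.synchronize() itself is taken from the patch.

    # Illustrative sketch only; load_weights_with_sync and the is_hpu
    # detection via habana_frameworks are assumptions, not vLLM code.
    from typing import Iterable, Tuple

    import torch

    try:
        # Importing habana_frameworks.torch registers the torch.hpu backend.
        import habana_frameworks.torch  # noqa: F401
        is_hpu = torch.hpu.is_available()
    except ImportError:
        is_hpu = False


    def load_weights_with_sync(params_dict: dict,
                               weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
        """Copy weights into existing parameters, then flush pending HPU work."""
        for name, loaded_weight in weights:
            param = params_dict[name]
            # In-place copy; this is the essence of what a default weight loader does.
            param.data.copy_(loaded_weight)
        if is_hpu:
            # Block until queued device copies finish so staged tensors are
            # released before the next loading phase begins.
            torch.hpu.synchronize()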