#0: update HF llama-2 version
farbabi authored and tt-rkim committed Nov 20, 2023
1 parent ea40f8d commit b2e9fd9
Showing 8 changed files with 33 additions and 83 deletions.
6 changes: 3 additions & 3 deletions models/experimental/llama_old/README.md
@@ -4,13 +4,13 @@ In order to use HuggingFace Llama model we need a development version of Transfo

# Use pretrained weights

-The weights used in the tests are downloaded from: https://huggingface.co/decapoda-research/llama-7b-hf
+The weights used in the tests are downloaded from: https://huggingface.co/baffo32/decapoda-research-llama-7B-hf

How to use the weights:
```
from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
-model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
+tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+model = AutoModelForCausalLM.from_pretrained("baffo32/decapoda-research-llama-7B-hf")
```

An issue that will probably appear is:
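As a quick sanity check that the swapped-in checkpoint pairs correctly with the separate tokenizer repo, a minimal generation script can be run (a sketch only; the prompt and generation settings are illustrative, and it assumes network access to the Hugging Face Hub):

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Note: the tokenizer and the weights now live in two different HF repos.
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
model = AutoModelForCausalLM.from_pretrained(
    "baffo32/decapoda-research-llama-7B-hf", torch_dtype=torch.float32
)
model.eval()

inputs = tokenizer("The capital of France is", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```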
29 changes: 7 additions & 22 deletions models/experimental/llama_old/tests/test_llama_attention.py
@@ -12,12 +12,7 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

from models.experimental.llama_old.llama_utils import *
-from models.utility_functions import (
-    comp_allclose,
-    comp_pcc,
-    torch_to_tt_tensor_rm,
-    tt_to_torch_tensor
-)
+from models.utility_functions import comp_allclose, comp_pcc, torch_to_tt_tensor_rm, tt_to_torch_tensor

from models.experimental.llama_old.tt.llama_attention import TtLlamaAttention

@@ -35,16 +30,12 @@ def forward(self, x, y):
        return result


-def run_test_LlamaAttention_inference(
-    device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-):
+def run_test_LlamaAttention_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc):
    model_name = model_version
    tokenizer_name = tokenizer_version

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()

    configuration = hugging_face_reference_model.config
@@ -72,9 +63,7 @@ def run_test_LlamaAttention_inference(
    position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

    # PyTorch output =======================================================================
-    pytorch_LlamaAttention_model = PytorchLlamaAttentionModel(
-        hugging_face_reference_model, layer_num
-    )
+    pytorch_LlamaAttention_model = PytorchLlamaAttentionModel(hugging_face_reference_model, layer_num)
    pytorch_out = pytorch_LlamaAttention_model(x=attention_input, y=position_ids)

    # TT hardware execution =================================================================
@@ -114,7 +103,7 @@ def run_test_LlamaAttention_inference(
    "model_version, tokenizer_version, batch, seq_len, on_weka, pcc",
    (
        pytest.param(
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            1,
            128,
@@ -123,9 +112,5 @@
        ),
    ),
)
-def test_LlamaAttention_inference(
-    device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-):
-    run_test_LlamaAttention_inference(
-        device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-    )
+def test_LlamaAttention_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc):
+    run_test_LlamaAttention_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc)
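Each of these tests gates on comp_pcc, which scores how closely the TT output tracks the HF reference as a Pearson correlation coefficient (PCC). The real helper lives in models/utility_functions and is not part of this diff; a minimal sketch of such a check, assuming a plain flattened Pearson correlation, could look like:

```python
import torch

def pcc(golden: torch.Tensor, calculated: torch.Tensor) -> float:
    # Flatten both tensors and compute the Pearson correlation coefficient.
    g = golden.flatten().float()
    c = calculated.flatten().float()
    g, c = g - g.mean(), c - c.mean()
    return float((g @ c) / (g.norm() * c.norm() + 1e-12))

# Sketch of the pass/fail pattern the tests follow; the threshold corresponds
# to the pcc parameter in the parametrization above.
def assert_with_pcc(golden, calculated, threshold):
    score = pcc(golden, calculated)
    assert score >= threshold, f"PCC {score:.5f} below threshold {threshold}"
```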
6 changes: 2 additions & 4 deletions models/experimental/llama_old/tests/test_llama_causallm.py
@@ -35,9 +35,7 @@ def run_test_llamaCausallm_inference(
    tokenizer_name = tokenizer_version

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()
    configuration = hugging_face_reference_model.config
    state_dict = hugging_face_reference_model.state_dict()
@@ -91,7 +89,7 @@ def run_test_llamaCausallm_inference(
    "model_version, tokenizer_version, batch, seq_len, num_decoders, max_position_embeddings, on_weka, pcc",
    (
        (
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            4,
            128,
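On the reference side of the causal-LM test, the golden logits come from plain Hugging Face inference. A sketch of that step under the new model ID (the random input construction is illustrative; the real test tokenizes a prompt):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "baffo32/decapoda-research-llama-7B-hf", torch_dtype=torch.float32
)
model.eval()

batch, seq_len = 4, 128  # mirrors the parametrization above
input_ids = torch.randint(0, model.config.vocab_size, (batch, seq_len))
with torch.no_grad():
    golden_logits = model(input_ids).logits  # shape: (batch, seq_len, vocab_size)
```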
19 changes: 5 additions & 14 deletions models/experimental/llama_old/tests/test_llama_decoder.py
@@ -9,7 +9,6 @@
from torch import nn



from typing import Tuple

from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -45,9 +44,7 @@ def run_test_LlamaDecoder_inference(
    tokenizer_name = tokenizer_version

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()
    configuration = hugging_face_reference_model.config
    state_dict = hugging_face_reference_model.state_dict()
@@ -73,9 +70,7 @@ def run_test_LlamaDecoder_inference(
    position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

    # PyTorch output =======================================================================
-    pytorch_LlamaDecoder_model = PytorchLlamaDecoderModel(
-        hugging_face_reference_model, decoder_id
-    )
+    pytorch_LlamaDecoder_model = PytorchLlamaDecoderModel(hugging_face_reference_model, decoder_id)
    pytorch_LlamaDecoder_model.eval()
    pytorch_out = pytorch_LlamaDecoder_model(x=llama_input, y=position_ids)

@@ -92,9 +87,7 @@ def run_test_LlamaDecoder_inference(
        max_position_embeddings,
        configuration,
    )
-    tt_out = tt_LlamaDecoder_model(
-        hidden_states=tt_llama_input, position_ids=position_ids
-    )
+    tt_out = tt_LlamaDecoder_model(hidden_states=tt_llama_input, position_ids=position_ids)
    # transform to PyTorch tensor
    # take only hidden_states tensor if tuple is obtained
    tt_out = tt_to_torch_tensor(tt_out[0])
@@ -117,7 +110,7 @@ def run_test_LlamaDecoder_inference(
    "model_version, tokenizer_version, batch, seq_len, decoder_id, on_weka, pcc",
    (
        (
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            1,
            128,
@@ -127,9 +120,7 @@
        ),
    ),
)
-def test_LlamaDecoder_inference(
-    device, model_version, tokenizer_version, batch, seq_len, decoder_id, on_weka, pcc
-):
+def test_LlamaDecoder_inference(device, model_version, tokenizer_version, batch, seq_len, decoder_id, on_weka, pcc):
    run_test_LlamaDecoder_inference(
        device,
        model_version,
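The position_ids line visible in these hunks follows the stock Llama recipe. As a standalone illustration (past_key_values_length is zero here because these tests run a single full-sequence forward pass without a KV cache):

```python
import torch

batch, seq_length = 1, 128
past_key_values_length = 0  # no KV cache in these tests

position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)  # shape: (1, seq_length)
```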
24 changes: 7 additions & 17 deletions models/experimental/llama_old/tests/test_llama_layer_norm.py
@@ -35,17 +35,13 @@ def forward(self, x):
        return result


-def run_test_LlamaLayerNorm_inference(
-    device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-):
+def run_test_LlamaLayerNorm_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc):
    model_name = model_version
    tokenizer_name = tokenizer_version

-    # https://huggingface.co/decapoda-research/llama-7b-hf
+    # https://huggingface.co/baffo32/decapoda-research-llama-7B-hf
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()
    configuration = hugging_face_reference_model.config
    state_dict = hugging_face_reference_model.state_dict()
@@ -56,9 +52,7 @@ def run_test_LlamaLayerNorm_inference(
    layer_num = 0

    # PyTorch output ---------------------------------------------------------------------
-    pytorch_LlamaRMSNorm_model = PytorchLlamaRMSNormModel(
-        hugging_face_reference_model, layer_num
-    )
+    pytorch_LlamaRMSNorm_model = PytorchLlamaRMSNormModel(hugging_face_reference_model, layer_num)
    pytorch_out = pytorch_LlamaRMSNorm_model(llama_layer_norm_input)
    logger.info(f"PyTorch output shape: {pytorch_out.shape}")

@@ -98,7 +92,7 @@ def run_test_LlamaLayerNorm_inference(
    "model_version, tokenizer_version, batch, seq_len, on_weka, pcc",
    (
        (
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            1,
            2048,
@@ -107,9 +101,5 @@
        ),
    ),
)
-def test_LlamaLayerNorm_inference(
-    model_version, tokenizer_version, batch, seq_len, on_weka, pcc, device
-):
-    run_test_LlamaLayerNorm_inference(
-        device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-    )
+def test_LlamaLayerNorm_inference(model_version, tokenizer_version, batch, seq_len, on_weka, pcc, device):
+    run_test_LlamaLayerNorm_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc)
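Despite the file name, the module under test here is Llama's RMSNorm rather than classic LayerNorm. For background, the standard formulation (textbook math, not this repo's TT implementation) normalizes by the root mean square of the activations, with no mean subtraction and no bias:

```python
import torch
from torch import nn

class RMSNormSketch(nn.Module):
    """Reference RMSNorm: y = weight * x / sqrt(mean(x**2) + eps)."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Mean of squares over the hidden dimension, then reciprocal sqrt.
        variance = x.pow(2).mean(-1, keepdim=True)
        return self.weight * x * torch.rsqrt(variance + self.eps)
```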
23 changes: 6 additions & 17 deletions models/experimental/llama_old/tests/test_llama_mlp.py
@@ -8,7 +8,6 @@
from torch import nn



from transformers import AutoTokenizer, AutoModelForCausalLM


@@ -35,16 +34,12 @@ def forward(self, x):
        return result


-def run_test_LlamaMLP_inference(
-    device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-):
+def run_test_LlamaMLP_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc):
    model_name = model_version
    tokenizer_name = tokenizer_version

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()
    configuration = hugging_face_reference_model.config
    state_dict = hugging_face_reference_model.state_dict()
@@ -56,9 +51,7 @@ def run_test_LlamaMLP_inference(
    base_url = "model.layers"

    # PyTorch output --------------------------------------------------------------------
-    pytorch_LlamaMLP_model = PytorchLlamaMLPModel(
-        hugging_face_reference_model, layer_num
-    )
+    pytorch_LlamaMLP_model = PytorchLlamaMLPModel(hugging_face_reference_model, layer_num)
    pytorch_out = pytorch_LlamaMLP_model(llama_mlp_input)  # .unsqueeze(1)

    # TT hardware execution -------------------------------------------------------------
@@ -94,7 +87,7 @@ def run_test_LlamaMLP_inference(
    "model_version, tokenizer_version, batch, seq_len, on_weka, pcc",
    (
        (
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            1,
            2048,
@@ -103,9 +96,5 @@
        ),
    ),
)
-def test_LlamaMLP_inference(
-    device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-):
-    run_test_LlamaMLP_inference(
-        device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc
-    )
+def test_LlamaMLP_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc):
+    run_test_LlamaMLP_inference(device, model_version, tokenizer_version, batch, seq_len, on_weka, pcc)
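For context, the Llama MLP compared in this file is the standard gated feed-forward block: down_proj(silu(gate_proj(x)) * up_proj(x)). A minimal PyTorch sketch (the default sizes shown are Llama-7B's, not read from the config here):

```python
import torch
import torch.nn.functional as F
from torch import nn

class LlamaMLPSketch(nn.Module):
    """Gated feed-forward: down(silu(gate(x)) * up(x)); all projections bias-free."""

    def __init__(self, hidden_size: int = 4096, intermediate_size: int = 11008):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
```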
7 changes: 2 additions & 5 deletions models/experimental/llama_old/tests/test_llama_model.py
@@ -7,7 +7,6 @@
import pytest



from models.experimental.llama.llama_utils import *

from transformers import (
@@ -38,9 +37,7 @@ def run_test_Llama_inference(
    tokenizer_name = tokenizer_version

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.float32
-    )
+    hugging_face_reference_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    hugging_face_reference_model.eval()
    configuration = hugging_face_reference_model.config
    state_dict = hugging_face_reference_model.state_dict()
@@ -98,7 +95,7 @@ def run_test_Llama_inference(
    "model_version, tokenizer_version, batch, seq_len, num_decoders, max_position_embeddings, on_weka, pcc",
    (
        (
-            "decapoda-research/llama-7b-hf",
+            "baffo32/decapoda-research-llama-7B-hf",
            "hf-internal-testing/llama-tokenizer",
            1,
            64,
2 changes: 1 addition & 1 deletion models/experimental/llama_old/tests/test_perf_llama.py
@@ -25,7 +25,7 @@


def run_perf_llama(expected_inference_time, expected_compile_time, device):
-    model_version = "decapoda-research/llama-7b-hf"
+    model_version = "baffo32/decapoda-research-llama-7B-hf"
    tokenizer_version = "hf-internal-testing/llama-tokenizer"
    batch = 1
    seq_len = 64
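run_perf_llama checks measured compile and inference times against the expected_* budgets passed in above. The repo's profiling helpers are not part of this diff; a generic sketch of that timing pattern (the helper name and structure are illustrative, not the repo's code):

```python
import time
import torch

def measure_perf(model, inputs, iters: int = 3):
    # First call is treated as "compile" time: it includes one-off kernel
    # compilation and caching. Later calls approximate steady-state inference.
    start = time.perf_counter()
    with torch.no_grad():
        model(**inputs)
    compile_time = time.perf_counter() - start

    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(iters):
            model(**inputs)
    inference_time = (time.perf_counter() - start) / iters
    return compile_time, inference_time
```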
