tenstorrent · pmarkovicTT · Jan 20, 2025 · Jan 20, 2025 · Jan 21, 2025 · Feb 4, 2025
@@ -6,6 +6,7 @@
 import pytest
 
 import forge
+from forge.verify.verify import verify
 from test.mlir.llama.utils.utils import load_model
 
 
@@ -124,3 +125,30 @@ def test_llama_inference_cache_cpu(model_path):
     # Generated text
     generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
     print(generated_text)
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "openlm-research/open_llama_3b",
+        pytest.param("meta-llama/Llama-3.2-1B", marks=pytest.mark.xfail(reason="Unsupported Op: repeat_interleave")),
+    ],
+)
+@pytest.mark.parametrize("seq_len", [128, 512, 2048])
+def test_llama_input_sequence_lengths(model_path, seq_len):
+    # Load Model and Tokenizer
+    framework_model, tokenizer = load_model(model_path, num_hidden_layers=1)
+
+    # Adjust tokenizer for max sequence length padding
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    tokenizer.model_max_length = seq_len
+
+    prompt = "Q: What is the largest animal?\nA:"
+    input_ids = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
+    input_ids = input_ids.to(torch.int32)
+
+    # Compile the model and run fwd pass
+    compiled_model = forge.compile(framework_model, input_ids)
+
+    verify([input_ids], framework_model, compiled_model)
@@ -16,6 +16,7 @@ def load_model(model_path="openlm-research/open_llama_3b", **kwargs):
     config.use_cache = kwargs.get("use_cache", False)
     config.output_attentions = kwargs.get("output_attentions", False)
     config.output_hidden_states = kwargs.get("output_hidden_states", False)
+    config.num_hidden_layers = kwargs.get("num_hidden_layers", 26)
 
     # Load the model
     framework_model = LlamaForCausalLM.from_pretrained(model_path, device_map="auto", config=config)