This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Reduce the UT evaluation time #498

Merged
merged 11 commits on Oct 19, 2023
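The updated tests point the lm_eval-based checks at tiny test checkpoints and cut the per-task sample limit from 20 to 5. As a minimal sketch of the reduced call pattern (using only the evaluate wrapper and arguments that appear in the diff below; the printed accuracy value is the one the updated test asserts):

    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate

    # Tiny checkpoint plus a 5-sample limit keeps the PIQA smoke test fast.
    results = evaluate(
        model="hf-causal",
        model_args='pretrained="hf-internal-testing/tiny-random-gptj",tokenizer="hf-internal-testing/tiny-random-gptj",dtype=float32',
        tasks=["piqa"],
        limit=5,
    )
    print(results["results"]["piqa"]["acc"])  # updated test expects 0.6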
tests/test_evaluation.py: 78 changes (26 additions, 52 deletions)
@@ -10,10 +10,9 @@ class TestLmEvaluationHarness(unittest.TestCase):
@classmethod
def setUpClass(self):
self.clm_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
"hf-internal-testing/tiny-random-gptj",
torchscript=True
)
- self.seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tmp_model = torch.jit.trace(
self.clm_model, self.clm_model.dummy_inputs["input_ids"]
)
@@ -33,32 +32,31 @@ def tearDownClass(self):
shutil.rmtree("./gptj", ignore_errors=True)
shutil.rmtree("./gptj-past", ignore_errors=True)
shutil.rmtree("./evaluation_results.json", ignore_errors=True)
shutil.rmtree("./llama", ignore_errors=True)
cmd = 'pip uninstall lm_eval -y'
p = subprocess.Popen(cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, shell=True) # nosec
p.communicate()


- def test_evaluate_for_casualLM(self):
+ def test_evaluate_for_CasualLM(self):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
results = evaluate(
model="hf-causal",
model_args='pretrained="hf-internal-testing/tiny-random-gptj",tokenizer="hf-internal-testing/tiny-random-gptj",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.45)
+ self.assertEqual(results["results"]["piqa"]["acc"], 0.6)

def test_evaluate_for_Seq2SeqLM(self):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
results = evaluate(
model="hf-seq2seq",
model_args='pretrained="hf-internal-testing/tiny-random-t5",tokenizer="hf-internal-testing/tiny-random-t5",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.60)
+ self.assertEqual(results["results"]["piqa"]["acc"], 1.0)

def test_evaluate_for_JitModel(self):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
@@ -67,34 +65,26 @@ def test_evaluate_for_JitModel(self):
model_args='pretrained="hf-internal-testing/tiny-random-gptj",tokenizer="hf-internal-testing/tiny-random-gptj",dtype=float32',
user_model=self.jit_model,
tasks=["piqa"],
- limit=20,
+ limit=5,
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.65)
+ self.assertEqual(results["results"]["piqa"]["acc"], 0.6)

- def test_lambada_for_llama(self):
- from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
- results = evaluate(
- model="hf-causal",
- model_args='pretrained="decapoda-research/llama-7b-hf",tokenizer="decapoda-research/llama-7b-hf",dtype=float32',
- tasks=["lambada_openai", "lambada_standard"],
- limit=20,
- )
- self.assertEqual(results["results"]["lambada_standard"]["acc"], 0.75)
- self.assertEqual(results["results"]["lambada_openai"]["acc"], 0.70)

def test_cnn_daily(self):
from intel_extension_for_transformers.llm.evaluation.hf_eval import summarization_evaluate
+ model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
results = summarization_evaluate(
- model=self.clm_model,
+ model=model,
tokenizer_name="facebook/opt-125m",
batch_size=1,
limit=5,
)
self.assertEqual(results["rouge2"], 18.0431)
+ model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
results = summarization_evaluate(
- model=self.seq2seq_model, tokenizer_name="t5-small", batch_size=1, limit=5
+ model=model, tokenizer_name="t5-small", batch_size=1, limit=5
)
self.assertEqual(results["rouge2"], 9.6795)
self.assertEqual(results["rouge2"], 9.5858)

def test_evaluate_for_ort_Seq2SeqLM(self):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
@@ -108,10 +98,10 @@ def test_evaluate_for_ort_Seq2SeqLM(self):
model="hf-seq2seq",
model_args='pretrained="./t5-past",tokenizer="./t5-past",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.60)
+ self.assertEqual(results["results"]["piqa"]["acc"], 1.0)

# test evaluate encoder_model + decoder_model + decoder_with_past_model
merged_model_path = "./t5-past/decoder_model_merged.onnx"
@@ -121,10 +111,10 @@ def test_evaluate_for_ort_Seq2SeqLM(self):
model="hf-seq2seq",
model_args='pretrained="./t5-past",tokenizer="./t5-past",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.60)
+ self.assertEqual(results["results"]["piqa"]["acc"], 1.0)

# test evaluate encoder_model + decoder_model
cmd = 'optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 --task text2text-generation t5/'
@@ -135,12 +125,12 @@ def test_evaluate_for_ort_Seq2SeqLM(self):
model="hf-seq2seq",
model_args='pretrained="./t5",tokenizer="./t5",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.60)
+ self.assertEqual(results["results"]["piqa"]["acc"], 1.0)

- def test_evaluate_for_ort_casualLM(self):
+ def test_evaluate_for_ort_CasualLM(self):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
cmd = 'optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation-with-past gptj-past/'
p = subprocess.Popen(cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE,
@@ -152,10 +142,10 @@ def test_evaluate_for_ort_casualLM(self):
model="hf-causal",
model_args='pretrained="./gptj-past",tokenizer="./gptj-past",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.45)
+ self.assertEqual(results["results"]["piqa"]["acc"], 0.6)

# test evaluate decoder_model + decoder_with_past_model
merged_model_path = "./gptj-past/decoder_model_merged.onnx"
@@ -165,10 +155,10 @@ def test_evaluate_for_ort_casualLM(self):
model="hf-causal",
model_args='pretrained="./gptj-past",tokenizer="./gptj-past",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.45)
+ self.assertEqual(results["results"]["piqa"]["acc"], 0.6)

# test evaluate decoder_model
cmd = 'optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation gptj/'
@@ -179,27 +169,11 @@ def test_evaluate_for_ort_casualLM(self):
model="hf-causal",
model_args='pretrained="./gptj",tokenizer="./gptj",dtype=float32',
tasks=["piqa"],
- limit=20,
+ limit=5,
model_format="onnx"
)
- self.assertEqual(results["results"]["piqa"]["acc"], 0.45)
+ self.assertEqual(results["results"]["piqa"]["acc"], 0.6)


- def test_tokenizer_for_llama(self):
- from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
- cmd = 'optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation llama/'
- p = subprocess.Popen(cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE, shell=True) # nosec
- p.communicate()
-
- results = evaluate(
- model="hf-causal",
- model_args='pretrained="./llama",tokenizer="decapoda-research/llama-7b-hf"',
- tasks=["lambada_openai"],
- limit=20,
- model_format="onnx"
- )
- self.assertEqual(results["results"]["lambada_openai"]["acc"], 0.70)

if __name__ == "__main__":
unittest.main()