
Evaluate on long context (32k, 64k, etc.) on 30B/70B large models #48

Open
CaesarWWK opened this issue Jan 9, 2024 · 5 comments

@CaesarWWK

Hi,

I found that the original script cannot handle large models on long context, since it uses multiprocessing and loads an entire model onto a single GPU.

I tried several approaches to add support for 30B/70B models, including deepspeed-inference, accelerate, and vLLM. In the end, vLLM was able to run the benchmark on large models with long context (a 34B model with 32k context on an 8*A800 node in my case), and it requires minimal changes to the original code.

I hope this information helps others who also want to evaluate large models.
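
The core of the change is just loading and running the model through vLLM instead of the original per-GPU loading. A minimal sketch (the checkpoint path and prompt are placeholders, and tensor_parallel_size should match your GPU count):

from vllm import LLM, SamplingParams

# Placeholders: point these at your own checkpoint and prompt
checkpoint = "/path/to/your/34b-long-context-checkpoint"
prompt = "..."

# Shard the model across the node's 8 GPUs instead of loading it all on one GPU
model = LLM(model=checkpoint, tensor_parallel_size=8, trust_remote_code=True)
sampling_params = SamplingParams(max_tokens=128, temperature=0.0)

# generate() returns one RequestOutput per prompt
output = model.generate(prompt, sampling_params)
pred = output[0].outputs[0].text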

@bys0318
Member

bys0318 commented Jan 10, 2024

You are right. We will soon update the code to also support 30B/70B models with accelerate/deepspeed.

@lvjianxin

> You are right. We will soon update the code to also support 30B/70B models with accelerate/deepspeed.

Waiting for this.

@lvjianxin

> Hi,
>
> I found that the original script cannot handle large models on long context, since it uses multiprocessing and loads an entire model onto a single GPU.
>
> I tried several approaches to add support for 30B/70B models, including deepspeed-inference, accelerate, and vLLM. In the end, vLLM was able to run the benchmark on large models with long context (a 34B model with 32k context on an 8*A800 node in my case), and it requires minimal changes to the original code.
>
> I hope this information helps others who also want to evaluate large models.

How did you do it? Can you share your code?

@CaesarWWK
Author

CaesarWWK commented Jan 15, 2024

> How did you do it? Can you share your code?

You may need to modify the code for your own setup; I removed model2path and some other parts of the original script. world_size and args.s are unused in this version, so you can remove them.

import os
from datasets import load_dataset
import torch
import json
from tqdm import tqdm
import numpy as np
import random
import argparse
import torch.distributed as dist
import torch.multiprocessing as mp
from vllm import LLM, SamplingParams
def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
    parser.add_argument('--s', help="model size in B (unused in this version)")
    parser.add_argument('--debug', action='store_true', help="Debug mode")
    parser.add_argument('--checkpoint', type=str, help="checkpoint path")
    parser.add_argument('--quantize', action='store_true', help="Load an AWQ-quantized checkpoint")

    return parser.parse_args(args)

# This is the customized building prompt for chat models
def build_chat(tokenizer, prompt, model_name):
    if "chatglm3" in model_name:
        prompt = tokenizer.build_chat_input(prompt)
    elif "chatglm" in model_name:
        prompt = tokenizer.build_prompt(prompt)
    elif "longchat" in model_name or "vicuna" in model_name:
        from fastchat.model import get_conversation_template
        conv = get_conversation_template("vicuna")
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
    elif "llama2" in model_name:
        prompt = f"[INST]{prompt}[/INST]"
    elif "xgen" in model_name:
        header = (
            "A chat between a curious human and an artificial intelligence assistant. "
            "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
        )
        prompt = header + f" ### Human: {prompt}\n###"
    elif "internlm" in model_name:
        prompt = f"<|User|>:{prompt}<eoh>\n<|Bot|>:"
    return prompt

def post_process(response, model_name):
    if "xgen" in model_name:
        response = response.strip().replace("Assistant:", "")
    elif "internlm" in model_name:
        response = response.split("<eoa>")[0]
    return response



def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)



if __name__ == '__main__':
    seed_everything(42)
    args = parse_args()
   


    mp.set_start_method('spawn', force=True)

    model2maxlen = json.load(open("config/model2maxlen.json", "r"))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = args.model
    # define your model; max_length is kept for reference, prompts are not truncated in this version
    max_length = model2maxlen[model_name]
    if args.e:
        datasets = ["trec",
 "samsum","triviaqa"]
                  #  
    else:
        # datasets = ["narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "hotpotqa", "2wikimqa", "musique", \
        #             "dureader", "gov_report", "qmsum", "multi_news", "vcsum", "trec", "triviaqa", "samsum", "lsht", \
        #             "passage_count", "passage_retrieval_en", "passage_retrieval_zh", "lcc", "repobench-p"]
        datasets = ["lsht","narrativeqa"]
    # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output
    dataset2prompt = json.load(open("config/dataset2prompt.json", "r"))
    dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r"))
    # predict on each dataset
    if not os.path.exists("pred"):
        os.makedirs("pred")
    if not os.path.exists("pred_e"):
        os.makedirs("pred_e")
    print(args.checkpoint)

    # Load the model with vLLM, sharded across 8 GPUs via tensor parallelism
    if args.quantize:
        model = LLM(model=args.checkpoint, tensor_parallel_size=8, trust_remote_code=True, quantization="AWQ")
    else:
        model = LLM(model=args.checkpoint, tensor_parallel_size=8, trust_remote_code=True)

    for dataset in datasets:
        if args.e:
            data=load_dataset("json", data_files=f"/home/wangweikuan1996/llm-eval/long_benchmark_data/{dataset}_e.jsonl", split="train")
            # data = load_dataset('THUDM/LongBench', f"{dataset}_e", split='test')
            if not os.path.exists(f"pred_e/{model_name}"):
                os.makedirs(f"pred_e/{model_name}")
            out_path = f"pred_e/{model_name}/{dataset}.jsonl"
        else:
            data = load_dataset("json", data_files=f"/home/wangweikuan1996/llm-eval/long_benchmark_data/{dataset}.jsonl", split="train")
            if not os.path.exists(f"pred/{model_name}"):
                os.makedirs(f"pred/{model_name}")
            out_path = f"pred/{model_name}/{dataset}.jsonl"
        if os.path.exists(out_path):
            os.remove(out_path)
        prompt_format = dataset2prompt[dataset]
        max_gen = dataset2maxlen[dataset]
        # The world_size / data_subsets logic from the original multiprocess script is
        # not needed here (vLLM manages the GPUs itself), so args.s and args.debug go unused.
        # Beam search (width 2) at temperature 0 for deterministic generation
        sampling_params = SamplingParams(max_tokens=max_gen, use_beam_search=True, n=2,
                                         temperature=0.0)
        for json_obj in tqdm(data):
            prompt = prompt_format.format(**json_obj)
            # For chat models you may want to wrap the prompt with build_chat() here
            output = model.generate(prompt, sampling_params)
            pred = output[0].outputs[0].text
            if pred == '':
                print(output)
            pred = post_process(pred, model_name)
            with open(out_path, "a", encoding="utf-8") as f:
                json.dump({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]}, f, ensure_ascii=False)
                f.write('\n')
            # get_pred(0,1,data_all,max_length,max_gen,prompt_format,dataset,device,model_name,model2path,out_path)
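
For reference, a typical invocation would look something like this (the script name is a placeholder, and --model has to match a key in config/model2maxlen.json):

python pred_longbench_vllm.py --model your-model-name --checkpoint /path/to/checkpoint --e

Add --quantize if the checkpoint is AWQ-quantized.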
        

@bys0318
Member

bys0318 commented Jan 20, 2024

Hi @CaesarWWK, thanks for your reply! @lvjianxin An easy way (without modifying the current codebase much) might be to add device_map="auto" to the model loading lines in load_model_and_tokenizer(). It will enable model inference across multiple GPUs.
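
For example, roughly like this (a sketch only; adapt it to the exact arguments used in load_model_and_tokenizer()):

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def load_model_and_tokenizer(path):
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",  # let accelerate shard layers across all visible GPUs
    )
    model = model.eval()
    return model, tokenizer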
