diff --git a/eval.py b/eval.py
new file mode 100644
index 000000000..32993f382
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,187 @@
+import inspect
+import os
+import traceback
+from typing import Union
+
+import numpy as np
+import pandas as pd
+import torch
+from matplotlib import pyplot as plt
+
+from generate import eval_func_param_names, eval_extra_columns, get_context, get_score_model, get_model, \
+    inputs_kwargs_list, evaluate
+from prompter import Prompter
+from utils import clear_torch_cache, NullContext, get_kwargs
+
+
+def run_eval(  # for local function:
+        base_model=None, lora_weights=None,
+        prompt_type=None, debug=None, chat=False, chat_context=None, stream_output=None,
+        eval_sharegpt_prompts_only=None, eval_sharegpt_prompts_only_seed=None, eval_sharegpt_as_output=None,
+        examples=None, is_low_mem=None,
+        # for get_model:
+        score_model=None, load_8bit=None, load_4bit=None, load_half=None, infer_devices=None, tokenizer_base_model=None,
+        gpu_id=None, local_files_only=None, resume_download=None, use_auth_token=None,
+        trust_remote_code=None, offload_folder=None, compile_model=None,
+        # for evaluate:
+        src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
+        model_state0=None, raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
+        use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None, chunk=None, chunk_size=None,
+        db_type=None, n_jobs=None, first_para=None, text_limit=None,
+):
+    if eval_sharegpt_prompts_only > 0:
+        # override default examples with shareGPT ones for human-level eval purposes only
+        eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
+        if not os.path.isfile(eval_filename):
+            os.system(
+                'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
+        import json
+        data = json.load(open(eval_filename, 'rt'))
+        # focus on data that starts with human, else likely chopped from other data
+        turn_start = 0  # odd in general
+        data = [x for x in data if len(x['conversations']) > turn_start + 1 and
+                x['conversations'][turn_start]['from'] == 'human' and
+                x['conversations'][turn_start + 1]['from'] == 'gpt']
+        np.random.seed(eval_sharegpt_prompts_only_seed)
+        example1 = examples[-1]  # pick reference example
+        examples = []
+        responses = []
+        for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
+            assert data[i]['conversations'][turn_start]['from'] == 'human'
+            instruction = data[i]['conversations'][turn_start]['value']
+            assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
+            output = data[i]['conversations'][turn_start + 1]['value']
+            examplenew = example1.copy()
+            assert not chat, "No gradio must use chat=False, uses nochat instruct"
+            examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
+            examplenew[eval_func_param_names.index('iinput_nochat')] = ''  # no input
+            examplenew[eval_func_param_names.index('context')] = get_context(chat_context, prompt_type)
+            examples.append(examplenew)
+            responses.append(output)
+
+    num_examples = len(examples)
+    scoring_path = 'scoring'
+    os.makedirs(scoring_path, exist_ok=True)
+    if eval_sharegpt_as_output:
+        used_base_model = 'gpt35'
+        used_lora_weights = ''
+    else:
+        used_base_model = str(base_model.split('/')[-1])
+        used_lora_weights = str(lora_weights.split('/')[-1])
+    eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
+                                                             eval_sharegpt_prompts_only_seed,
+                                                             eval_sharegpt_as_output,
+                                                             used_base_model,
+                                                             used_lora_weights)
+    eval_filename = os.path.join(scoring_path, eval_filename)
+
+    # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    device = 'cpu' if n_gpus == 0 else 'cuda'
+    context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
+
+    with context_class(device):
+        # ensure was set right above before examples generated
+        assert not stream_output, "stream_output=True does not make sense with example loop"
+        import time
+        from functools import partial
+
+        # get score model
+        smodel, stokenizer, sdevice = get_score_model(reward_type=True,
+                                                      **get_kwargs(get_score_model, exclude_names=['reward_type'],
+                                                                   **locals()))
+
+        if not eval_sharegpt_as_output:
+            model, tokenizer, device = get_model(reward_type=False,
+                                                 **get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
+            model_state = [model, tokenizer, device, base_model]
+            my_db_state = [None]
+            fun = partial(evaluate, model_state, my_db_state,
+                          **get_kwargs(evaluate, exclude_names=['model_state', 'my_db_state'] + eval_func_param_names,
+                                       **locals()))
+        else:
+            assert eval_sharegpt_prompts_only > 0
+
+            def get_response(*args, exi=0):
+                # assumes same ordering of examples and responses
+                yield responses[exi]
+
+            fun = get_response
+        t0 = time.time()
+        score_dump = []
+
+        for exi, ex in enumerate(examples):
+            instruction = ex[eval_func_param_names.index('instruction_nochat')]
+            iinput = ex[eval_func_param_names.index('iinput_nochat')]
+            context = ex[eval_func_param_names.index('context')]
+            clear_torch_cache()
+            print("")
+            print("START" + "=" * 100)
+            print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
+            print("-" * 105)
+            # fun yields as generator, so have to iterate over it
+            # Also means likely do NOT want --stream_output=True, else would show all generations
+            gener = fun(*tuple(ex), exi=exi) if eval_sharegpt_as_output else fun(*tuple(ex))
+            for res in gener:
+                print(res)
+                if smodel:
+                    score_with_prompt = False
+                    if score_with_prompt:
+                        data_point = dict(instruction=instruction, input=iinput, context=context)
+                        prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
+                        prompt = prompter.generate_prompt(data_point)
+                    else:
+                        # just raw input and output
+                        if eval_sharegpt_prompts_only > 0:
+                            # only our own examples have this filled at moment
+                            assert iinput in [None, ''], iinput  # should be no iinput
+                        if not (chat_context and prompt_type == 'human_bot'):
+                            assert context in [None, ''], context  # should be no context
+                        prompt = instruction
+                    cutoff_len = 768 if is_low_mem else 2048
+                    inputs = stokenizer(prompt, res,
+                                        return_tensors="pt",
+                                        truncation=True,
+                                        max_length=cutoff_len)
+                    try:
+                        score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
+                    except torch.cuda.OutOfMemoryError as e:
+                        print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
+                              flush=True)
+                        traceback.print_exc()
+                        score = 0.0
+                        clear_torch_cache()
+                    except (Exception, RuntimeError) as e:
+                        if 'Expected all tensors to be on the same device' in str(e) or \
+                                'expected scalar type Half but found Float' in str(e) or \
+                                'probability tensor contains either' in str(e) or \
+                                'cublasLt ran into an error!' in str(e):
+                            print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
+                                  flush=True)
+                            traceback.print_exc()
+                            score = 0.0
+                            clear_torch_cache()
+                        else:
+                            raise
+                    print("SCORE %s: %s" % (exi, score), flush=True)
+                    score_dump.append(ex + [prompt, res, score])
+                    # dump every score in case abort
+                    df_scores = pd.DataFrame(score_dump,
+                                             columns=eval_func_param_names + eval_extra_columns)
+                    df_scores.to_parquet(eval_filename, index=False)
+                    # plot histogram so far
+                    plt.figure(figsize=(10, 10))
+                    plt.hist(df_scores['score'], bins=20)
+                    score_avg = np.mean(df_scores['score'])
+                    score_median = np.median(df_scores['score'])
+                    plt.title("Score avg: %s median: %s" % (score_avg, score_median))
+                    plt.savefig(eval_filename.replace('.parquet', '.png'))
+                    plt.close()
+
+            print("END" + "=" * 102)
+            print("")
+            t2 = time.time()
+            print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
+        t1 = time.time()
+        print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
+    return eval_filename
diff --git a/generate.py b/generate.py
index abc717f88..1a3c74a27 100644
--- a/generate.py
+++ b/generate.py
@@ -15,7 +15,7 @@
 from loaders import get_loaders
 from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
-    import_matplotlib, get_device, makedirs
+    import_matplotlib, get_device, makedirs, get_kwargs
 
 import_matplotlib()
 
 from matplotlib import pyplot as plt
@@ -54,6 +54,7 @@ def main(
         tokenizer_base_model: str = '',
         lora_weights: str = "",
         gpu_id: int = 0,
+        compile_model: bool = True,
         prompt_type: Union[int, str] = None,
 
         # input to generation
@@ -146,6 +147,7 @@ def main(
     :param tokenizer_base_model: tokenizer HF-type name
     :param lora_weights: LORA weights path/HF link
     :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param compile_model: whether to compile the model
     :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
     :param temperature: generation temperature
     :param top_p: generation top_p
@@ -374,156 +376,8 @@ def main(
     assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
 
     if not gradio:
-        if eval_sharegpt_prompts_only > 0:
-            # override default examples with shareGPT ones for human-level eval purposes only
-            eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
-            if not os.path.isfile(eval_filename):
-                os.system(
-                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
-            import json
-            data = json.load(open(eval_filename, 'rt'))
-            # focus on data that starts with human, else likely chopped from other data
-            turn_start = 0  # odd in general
-            data = [x for x in data if len(x['conversations']) > turn_start + 1 and
-                    x['conversations'][turn_start]['from'] == 'human' and
-                    x['conversations'][turn_start + 1]['from'] == 'gpt']
-            np.random.seed(eval_sharegpt_prompts_only_seed)
-            example1 = examples[-1]  # pick reference example
-            examples = []
-            responses = []
-            for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
-                assert data[i]['conversations'][turn_start]['from'] == 'human'
-                instruction = data[i]['conversations'][turn_start]['value']
-                assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
-                output = data[i]['conversations'][turn_start + 1]['value']
-                examplenew = example1.copy()
-                assert not chat, "No gradio must use chat=False, uses nochat instruct"
-                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
-                examplenew[eval_func_param_names.index('iinput_nochat')] = ''  # no input
-                examplenew[eval_func_param_names.index('context')] = get_context(chat_context, prompt_type)
-                examples.append(examplenew)
-                responses.append(output)
-
-        num_examples = len(examples)
-        scoring_path = 'scoring'
-        os.makedirs(scoring_path, exist_ok=True)
-        if eval_sharegpt_as_output:
-            used_base_model = 'gpt35'
-            used_lora_weights = ''
-        else:
-            used_base_model = str(base_model.split('/')[-1])
-            used_lora_weights = str(lora_weights.split('/')[-1])
-        eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
-                                                                 eval_sharegpt_prompts_only_seed,
-                                                                 eval_sharegpt_as_output,
-                                                                 used_base_model,
-                                                                 used_lora_weights)
-        eval_filename = os.path.join(scoring_path, eval_filename)
-
-        # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
-        device = 'cpu' if n_gpus == 0 else 'cuda'
-        context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
-
-        with context_class(device):
-            # ensure was set right above before examples generated
-            assert not stream_output, "stream_output=True does not make sense with example loop"
-            import time
-            from functools import partial
-
-            # get score model
-            smodel, stokenizer, sdevice = get_score_model(**locals())
-
-            if not eval_sharegpt_as_output:
-                model, tokenizer, device = get_model(**locals())
-                model_state = [model, tokenizer, device, base_model]
-                kwargs_evaluate = {k: v for k, v in locals().items() if k in inputs_kwargs_list}
-                my_db_state = [None]
-                fun = partial(evaluate, model_state, my_db_state, **kwargs_evaluate)
-            else:
-                assert eval_sharegpt_prompts_only > 0
-
-                def get_response(*args, exi=0):
-                    # assumes same ordering of examples and responses
-                    yield responses[exi]
-
-                fun = get_response
-            t0 = time.time()
-            score_dump = []
-
-            for exi, ex in enumerate(examples):
-                instruction = ex[eval_func_param_names.index('instruction_nochat')]
-                iinput = ex[eval_func_param_names.index('iinput_nochat')]
-                context = ex[eval_func_param_names.index('context')]
-                clear_torch_cache()
-                print("")
-                print("START" + "=" * 100)
-                print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
-                print("-" * 105)
-                # fun yields as generator, so have to iterate over it
-                # Also means likely do NOT want --stream_output=True, else would show all generations
-                gener = fun(*tuple(ex), exi=exi) if eval_sharegpt_as_output else fun(*tuple(ex))
-                for res in gener:
-                    print(res)
-                    if smodel:
-                        score_with_prompt = False
-                        if score_with_prompt:
-                            data_point = dict(instruction=instruction, input=iinput, context=context)
-                            prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
-                            prompt = prompter.generate_prompt(data_point)
-                        else:
-                            # just raw input and output
-                            if eval_sharegpt_prompts_only > 0:
-                                # only our own examples have this filled at moment
-                                assert iinput in [None, ''], iinput  # should be no iinput
-                            if not (chat_context and prompt_type == 'human_bot'):
-                                assert context in [None, ''], context  # should be no context
-                            prompt = instruction
-                        cutoff_len = 768 if is_low_mem else 2048
-                        inputs = stokenizer(prompt, res,
-                                            return_tensors="pt",
-                                            truncation=True,
-                                            max_length=cutoff_len)
-                        try:
-                            score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
-                        except torch.cuda.OutOfMemoryError as e:
-                            print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
-                                  flush=True)
-                            traceback.print_exc()
-                            score = 0.0
-                            clear_torch_cache()
-                        except (Exception, RuntimeError) as e:
-                            if 'Expected all tensors to be on the same device' in str(e) or \
-                                    'expected scalar type Half but found Float' in str(e) or \
-                                    'probability tensor contains either' in str(e) or \
-                                    'cublasLt ran into an error!' in str(e):
-                                print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
-                                      flush=True)
-                                traceback.print_exc()
-                                score = 0.0
-                                clear_torch_cache()
-                            else:
-                                raise
-                        print("SCORE %s: %s" % (exi, score), flush=True)
-                        score_dump.append(ex + [prompt, res, score])
-                        # dump every score in case abort
-                        df_scores = pd.DataFrame(score_dump,
-                                                 columns=eval_func_param_names + eval_extra_columns)
-                        df_scores.to_parquet(eval_filename, index=False)
-                        # plot histogram so far
-                        plt.figure(figsize=(10, 10))
-                        plt.hist(df_scores['score'], bins=20)
-                        score_avg = np.mean(df_scores['score'])
-                        score_median = np.median(df_scores['score'])
-                        plt.title("Score avg: %s median: %s" % (score_avg, score_median))
-                        plt.savefig(eval_filename.replace('.parquet', '.png'))
-                        plt.close()
-
-                print("END" + "=" * 102)
-                print("")
-                t2 = time.time()
-                print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
-            t1 = time.time()
-            print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
+        from eval import run_eval
+        eval_filename = run_eval(**get_kwargs(run_eval, exclude_names=['model_state0'], **locals()))
         return eval_filename
 
     if gradio:
@@ -533,14 +387,14 @@ def get_response(*args, exi=0):
     # get default model
     all_kwargs = locals().copy()
     if all_kwargs.get('base_model') and not all_kwargs['login_mode_if_model0']:
-        model0, tokenizer0, device = get_model(**all_kwargs)
+        model0, tokenizer0, device = get_model(reward_type=False, **get_kwargs(get_model, exclude_names=['reward_type'], **all_kwargs))
     else:
         # if empty model, then don't load anything, just get gradio up
         model0, tokenizer0, device = None, None, None
     model_state0 = [model0, tokenizer0, device, all_kwargs['base_model']]
 
     # get score model
-    smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
+    smodel, stokenizer, sdevice = get_score_model(reward_type=True, **get_kwargs(get_score_model, exclude_names=['reward_type'], **all_kwargs))
     score_model_state0 = [smodel, stokenizer, sdevice, score_model]
 
     if enable_captions:
@@ -552,6 +406,7 @@ def get_response(*args, exi=0):
     else:
         caption_loader = False
 
+    # assume gradio needs everything
     go_gradio(**locals())
 
 
@@ -669,8 +524,7 @@ def get_model(
         use_auth_token: Union[str, bool] = False,
         trust_remote_code: bool = True,
        offload_folder: str = None,
-        compile: bool = True,
-        **kwargs,
+        compile_model: bool = True,
 ):
     """
 
@@ -690,8 +544,7 @@ def get_model(
     :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
     :param trust_remote_code: trust code needed by model
     :param offload_folder: offload folder
-    :param compile: whether to compile torch model
-    :param kwargs:
+    :param compile_model: whether to compile torch model
     :return:
     """
     print("Get %s model" % base_model, flush=True)
@@ -835,7 +688,7 @@ def get_model(
 
     if not isinstance(tokenizer, str):
         model.eval()
-        if torch.__version__ >= "2" and sys.platform != "win32" and compile:
+        if torch.__version__ >= "2" and sys.platform != "win32" and compile_model:
             model = torch.compile(model)
 
     return model, tokenizer, device
@@ -854,19 +707,34 @@ def pop_unused_model_kwargs(model_kwargs):
            model_kwargs.pop(k)
 
 
-def get_score_model(**kwargs):
-    # score model
-    if kwargs.get('score_model') is not None and kwargs.get('score_model').strip():
-        score_all_kwargs = kwargs.copy()
-        score_all_kwargs['load_8bit'] = False
-        score_all_kwargs['load_4bit'] = False
-        score_all_kwargs['load_half'] = False
-        score_all_kwargs['base_model'] = kwargs.get('score_model').strip()
-        score_all_kwargs['tokenizer_base_model'] = ''
-        score_all_kwargs['lora_weights'] = ''
-        score_all_kwargs['llama_type'] = False
-        score_all_kwargs['compile'] = False
-        smodel, stokenizer, sdevice = get_model(**score_all_kwargs)
+def get_score_model(score_model: str = None,
+                    load_8bit: bool = False,
+                    load_4bit: bool = False,
+                    load_half: bool = True,
+                    infer_devices: bool = True,
+                    base_model: str = '',
+                    tokenizer_base_model: str = '',
+                    lora_weights: str = "",
+                    gpu_id: int = 0,
+
+                    reward_type: bool = None,
+                    local_files_only: bool = False,
+                    resume_download: bool = True,
+                    use_auth_token: Union[str, bool] = False,
+                    trust_remote_code: bool = True,
+                    offload_folder: str = None,
+                    compile_model: bool = True,
+                    ):
+    if score_model is not None and score_model.strip():
+        load_8bit = False
+        load_4bit = False
+        load_half = False
+        base_model = score_model.strip()
+        tokenizer_base_model = ''
+        lora_weights = ''
+        llama_type = False
+        compile_model = False
+        smodel, stokenizer, sdevice = get_model(**get_kwargs(get_model, **locals()))
     else:
         smodel, stokenizer, sdevice = None, None, None
     return smodel, stokenizer, sdevice
diff --git a/gradio_runner.py b/gradio_runner.py
index 562d0c0c0..35c2c4d5e 100644
--- a/gradio_runner.py
+++ b/gradio_runner.py
@@ -15,7 +15,7 @@
 from prompter import Prompter, \
     prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt
 from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
-    ping, get_short_name, get_url, makedirs
+    ping, get_short_name, get_url, makedirs, get_kwargs
 from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa, langchain_modes, \
     inputs_kwargs_list, get_cutoffs, scratch_base_dir
 
@@ -32,7 +32,6 @@ def go_gradio(**kwargs):
     admin_pass = kwargs['admin_pass']
     model_state0 = kwargs['model_state0']
     score_model_state0 = kwargs['score_model_state0']
-    queue = True
     dbs = kwargs['dbs']
     db_type = kwargs['db_type']
     visible_langchain_modes = kwargs['visible_langchain_modes']
@@ -41,7 +40,6 @@ def go_gradio(**kwargs):
     enable_sources_list = kwargs['enable_sources_list']
     enable_url_upload = kwargs['enable_url_upload']
     enable_text_upload = kwargs['enable_text_upload']
-    allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
     use_openai_embedding = kwargs['use_openai_embedding']
     hf_embedding_model = kwargs['hf_embedding_model']
     enable_captions = kwargs['enable_captions']
@@ -50,6 +48,8 @@ def go_gradio(**kwargs):
     caption_loader = kwargs['caption_loader']
 
     # easy update of kwargs needed for evaluate() etc.
+    queue = True
+    allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
     kwargs.update(locals())
 
     if 'mbart-' in kwargs['model_lower']:
@@ -1237,7 +1237,7 @@ def load_model(model_name, lora_weights, model_state_old, prompt_type_old, load_
             lora_weights = ''
 
         all_kwargs1['lora_weights'] = lora_weights.strip()
-        model1, tokenizer1, device1 = get_model(**all_kwargs1)
+        model1, tokenizer1, device1 = get_model(reward_type=False, **get_kwargs(get_model, exclude_names=['reward_type'], **all_kwargs1))
         clear_torch_cache()
 
         if kwargs['debug']:
diff --git a/utils.py b/utils.py
index 775954b48..e1c7add6c 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,7 @@
 import contextlib
 import functools
 import hashlib
+import inspect
 import os
 import gc
 import pathlib
@@ -785,3 +786,17 @@ def print_progress(self):
         self._pbar.total = self.n_dispatched_tasks
         self._pbar.n = self.n_completed_tasks
         self._pbar.refresh()
+
+
+def get_kwargs(func, exclude_names=None, **kwargs):
+    func_names = list(inspect.signature(func).parameters)
+    missing_kwargs = [x for x in func_names if x not in kwargs]
+    if exclude_names:
+        for k in exclude_names:
+            if k in missing_kwargs:
+                missing_kwargs.remove(k)
+            if k in func_names:
+                func_names.remove(k)
+    assert not missing_kwargs, "Missing %s" % missing_kwargs
+    kwargs = {k: v for k, v in kwargs.items() if k in func_names}
+    return kwargs
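Note (reviewer sketch, not part of the patch): most call sites in this change now go through the new utils.get_kwargs helper, which keeps only the keyword arguments the callee accepts and asserts that every parameter of the callee is supplied unless listed in exclude_names. A hypothetical toy callee illustrates the behavior:

    from utils import get_kwargs

    def toy(a, b, c=3):
        return a + b + c

    everything = dict(a=1, b=2, c=30, unrelated=True)
    toy(**get_kwargs(toy, **everything))                    # 'unrelated' is dropped; returns 33
    toy(**get_kwargs(toy, exclude_names=['c'], a=1, b=2))   # 'c' excluded, default used; returns 6
    # get_kwargs(toy, a=1, b=2) raises AssertionError ("Missing ['c']"):
    # even defaulted parameters must be present in kwargs unless excluded.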