
Merge pull request #197 from h2oai/fixtestllamacpp
Fix and test llamacpp
pseudotensor authored May 29, 2023
2 parents 7b77558 + a701ff8 commit 793916a
Showing 13 changed files with 236 additions and 33 deletions.
12 changes: 10 additions & 2 deletions .env_gpt4all
@@ -1,2 +1,10 @@
# GPT4ALL model_kwargs
model_path_gptj=ggml-gpt4all-j-v1.3-groovy.bin
# GPT4All or llama-cpp-python model_kwargs

# GPT4All GPT-J type, from model explorer choice, so it downloads
model_name_gptj=ggml-gpt4all-j-v1.3-groovy.bin

# llama-cpp-python type, supporting version 3 quantization, here from locally built llama.cpp q4 v3 quantization
model_path_llama=./models/7B/ggml-model-q4_0.bin

# GPT4All LLaMa type, supporting version 2 quantization, here from model explorer choice so it downloads
model_name_gpt4all_llama=ggml-wizardLM-7B.q4_2.bin
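
For orientation, here is a minimal sketch (not the repo's actual loader) of how these entries could be read with python-dotenv and matched to the `--base_model` choices referenced in this commit; the file location and the key-per-model mapping are assumptions based on the comments above and the FAQ changes below.
```python
# Sketch only: read .env_gpt4all and show which entry each base_model would use.
# Assumes python-dotenv (pinned in requirements_optional_gpt4all.txt) and that
# the file sits in the current directory; h2oGPT's own loader may differ.
from dotenv import dotenv_values

env = dotenv_values(".env_gpt4all")
key_for_base_model = {
    "gptj": "model_name_gptj",                    # GPT4All GPT-J, fetched by name
    "llama": "model_path_llama",                  # local llama.cpp v3-quantized file
    "gpt4all_llama": "model_name_gpt4all_llama",  # GPT4All LLaMa, v2 quantization
}
for base_model, key in key_for_base_model.items():
    print("%s -> %s=%s" % (base_model, key, env.get(key)))
```
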
71 changes: 67 additions & 4 deletions FAQ.md
@@ -256,16 +256,79 @@ python generate.py --base_model=h2oai/h2ogpt-oasst1-512-20b --load_8bit=True
```
etc.

### CPU with no AVX2
### CPU with no AVX2 or using LLaMa.cpp

GPT4All-based models require AVX2, unless one recompiles that project on your system. Until then, use llama.cpp models instead,
e.g. by compiling the llama model on your system by following the [instructions](https://github.com/ggerganov/llama.cpp#description),
then adding an entry in the .env file like:
e.g. by building llama.cpp on your system by following the [instructions](https://github.com/ggerganov/llama.cpp#build) and installing [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), e.g. for Linux:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make clean
make LLAMA_OPENBLAS=1
```
for CPU, or for GPU:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make clean
make LLAMA_CUBLAS=1
```
etc., following the different build [scenarios](https://github.com/ggerganov/llama.cpp#build).

Then:
```bash
# obtain the original LLaMA model weights and place them in ./models, i.e. models should contain:
# 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

# install Python dependencies
conda create -n llamacpp -y
conda activate llamacpp
conda install python=3.10 -y
pip install -r requirements.txt

# convert the 7B model to ggml FP16 format
python convert.py models/7B/

# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0

# test by running the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
then add an entry in the .env file like (assumes version 3 quantization):
```.env_gpt4all
# model path and model_kwargs
model_path_llama=./models/7B/ggml-model-q4_0.bin
```
or wherever you placed the model.
or wherever you placed the model, with the path pointing to wherever the files are located (e.g. link from the h2oGPT repo to the llama.cpp repo folder), e.g.
```bash
cd ~/h2ogpt/
ln -s ~/llama.cpp/models/* .
```
then run h2oGPT like:
```bash
python generate.py --base_model='llama' --langchain_mode=UserData --user_path=user_path
```
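
Before wiring the model into h2oGPT, one can optionally confirm that llama-cpp-python loads the quantized file at all; a minimal sketch, assuming the model path used above:
```python
# Optional sanity check (sketch): load the v3-quantized model directly with
# llama-cpp-python and run a short completion; the path and prompt are examples.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model-q4_0.bin")
out = llm("Q: Name the planets in the solar system. A:", max_tokens=64, stop=["Q:"])
print(out["choices"][0]["text"])
```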

### Is this really a GGML file? Or: using version 2 quantization files from GPT4All that are LLaMa based

If you hit the error:
```text
Found model file.
llama.cpp: loading model from ./models/7B/ggml-model-q4_0.bin
error loading model: unknown (magic, version) combination: 67676a74, 00000003; is this really a GGML file?
llama_init_from_file: failed to load model
LLAMA ERROR: failed to load model from ./models/7B/ggml-model-q4_0.bin
```
then note that llama.cpp upgraded its file format to version 3, and we use a llama-cpp-python version that supports only that latest version 3. GPT4All does not support version 3 yet. If you want to use older version 2 llama quantized models, then do:
```bash
pip install --force-reinstall --ignore-installed --no-cache-dir llama-cpp-python==0.1.48
```
to go back to the prior version. Or specify the model using GPT4All as `--base_model='gpt4all_llama'` and ensure an entry exists like:
```.env_gpt4all
model_path_gpt4all_llama=./models/7B/ggml-model-q4_0.bin
```
assuming that file is from version 2 quantization.
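
To see which format a given file actually has, one can inspect its header; a rough diagnostic sketch, assuming the ggjt layout of a little-endian 4-byte magic followed by a 4-byte version (older plain-ggml files have no version field):
```python
# Diagnostic sketch: print the (magic, version) pair that the llama.cpp error reports.
# Assumes the ggjt header layout; adjust the path to your model file.
import struct

path = "./models/7B/ggml-model-q4_0.bin"
with open(path, "rb") as f:
    magic = struct.unpack("<I", f.read(4))[0]
    version = struct.unpack("<I", f.read(4))[0]
print("magic=%08x version=%08x" % (magic, version))
# magic 67676a74 ('ggjt') with version 3 requires the newer llama-cpp-python;
# version 2 needs llama-cpp-python==0.1.48 or the gpt4all_llama route above.
```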

### I get the error: `The model 'OptimizedModule' is not supported for . Supported models are ...`

2 changes: 1 addition & 1 deletion INSTALL.md
@@ -14,7 +14,7 @@ conda create -n h2ogpt -y
conda activate h2ogpt
conda install mamba -n base -c conda-forge
conda install python=3.10 -y
conda update -n base -c defaults conda
conda update -n base -c defaults conda -y
```
Enter a new shell and you should also see `(base)` in the prompt. Then, create the new env:
3 changes: 2 additions & 1 deletion cli.py
@@ -2,6 +2,7 @@
import torch

from generate import eval_func_param_names, get_score_model, get_model, evaluate
from prompter import non_hf_types
from utils import clear_torch_cache, NullContext, get_kwargs


@@ -72,7 +73,7 @@ def run_cli( # for local function:
outr = ''
res_old = ''
for res, extra in gener:
if base_model not in ['gptj', 'llama']:
if base_model not in non_hf_types:
if not stream_output:
print(res)
else:
9 changes: 4 additions & 5 deletions generate.py
@@ -35,7 +35,7 @@
from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
from accelerate import init_empty_weights, infer_auto_device_map

from prompter import Prompter, inv_prompt_type_to_model_lower
from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types
from stopping import get_stopping

eval_extra_columns = ['prompt', 'response', 'score']
@@ -572,7 +572,7 @@ def get_model(
"""
if verbose:
print("Get %s model" % base_model, flush=True)
if base_model in ['llama', 'gptj']:
if base_model in non_hf_types:
from gpt4all_llm import get_model_tokenizer_gpt4all
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
return model, tokenizer, device
@@ -924,8 +924,7 @@ def evaluate(
db1 = dbs[langchain_mode]
else:
db1 = None
if langchain_mode not in [False, 'Disabled', 'ChatLLM', 'LLM'] and db1 is not None or base_model in ['llama',
'gptj']:
if langchain_mode not in [False, 'Disabled', 'ChatLLM', 'LLM'] and db1 is not None or base_model in non_hf_types:
query = instruction if not iinput else "%s\n%s" % (instruction, iinput)
outr = ""
# use smaller cut_distanct for wiki_full since so many matches could be obtained, and often irrelevant unless close
@@ -967,7 +966,7 @@
print(
'Post-Generate Langchain: %s decoded_output: %s' % (str(datetime.now()), len(outr) if outr else -1),
flush=True)
if outr or base_model in ['llama', 'gptj']:
if outr or base_model in non_hf_types:
# if got no response (e.g. not showing sources and got no sources,
# so nothing to give to LLM), then slip through and ask LLM
# Or if llama/gptj, then just return since they had no response and can't go down below code path
99 changes: 89 additions & 10 deletions gpt4all_llm.py
@@ -34,17 +34,25 @@ def get_model_tokenizer_gpt4all(base_model, **kwargs):
if 'model_path_llama' not in model_kwargs:
raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
model_path = model_kwargs.pop('model_path_llama')
# FIXME: GPT4All version of llama doesn't handle new quantization, so use llama_cpp_python
from llama_cpp import Llama
model = Llama(model_path=model_path)
elif base_model == "gpt4all_llama":
if 'model_name_gpt4all_llama' not in model_kwargs and 'model_path_gpt4all_llama' not in model_kwargs:
raise ValueError("No model_name_gpt4all_llama or model_path_gpt4all_llama in %s" % env_gpt4all_file)
model_name = model_kwargs.pop('model_name_gpt4all_llama')
model_type = 'llama'
from gpt4all import GPT4All as GPT4AllModel
elif base_model == "gptj":
if 'model_path_gptj' not in model_kwargs:
raise ValueError("No model_path_gptj in %s" % env_gpt4all_file)
model_path = model_kwargs.pop('model_path_gptj')
model = GPT4AllModel(model_name=model_name, model_type=model_type)
elif base_model == "gptj":
if 'model_name_gptj' not in model_kwargs and 'model_path_gptj' not in model_kwargs:
raise ValueError("No model_name_gptj or model_path_gptj in %s" % env_gpt4all_file)
model_name = model_kwargs.pop('model_name_gptj')
model_type = 'gptj'
from gpt4all import GPT4All as GPT4AllModel
model = GPT4AllModel(model_name=model_name, model_type=model_type)
else:
raise ValueError("No such base_model %s" % base_model)
func_names = list(inspect.signature(GPT4AllModel).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
model = GPT4AllModel(model_path, **model_kwargs)
return model, FakeTokenizer(), 'cpu'


@@ -75,14 +83,20 @@ def get_llm_gpt4all(model_name, model=None,
'repeat_last_n': 64 if repetition_penalty != 1.0 else 0, 'repeat_penalty': repetition_penalty,
'temp': temperature, 'top_k': top_k, 'top_p': top_p}
if model_name == 'llama':
from langchain.llms import LlamaCpp
model_path = model_kwargs.pop('model_path_llama') if model is None else model
llm = LlamaCpp(model_path=model_path, n_ctx=n_ctx, callbacks=callbacks, verbose=False)
else:
llm = H2OLlamaCpp(model_path=model_path, n_ctx=n_ctx, callbacks=callbacks, verbose=False)
elif model_name == 'gpt4all_llama':
model_path = model_kwargs.pop('model_path_gpt4all_llama') if model is None else model
llm = H2OGPT4All(model=model_path, backend='llama', callbacks=callbacks,
verbose=False, **default_params,
)
elif model_name == 'gptj':
model_path = model_kwargs.pop('model_path_gptj') if model is None else model
llm = H2OGPT4All(model=model_path, backend='gptj', callbacks=callbacks,
verbose=False, **default_params,
)
else:
raise RuntimeError("No such model_name %s" % model_name)
return llm


@@ -130,3 +144,68 @@ def _call(
        if verbose:
            print("_call prompt: %s" % prompt, flush=True)
        return super()._call(prompt, stop=stop, run_manager=run_manager)


from langchain.llms import LlamaCpp


class H2OLlamaCpp(LlamaCpp):
    model_path: Any
    """Path to the pre-trained llama.cpp model file."""

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that llama-cpp-python library is installed."""
        if isinstance(values["model_path"], str):
            model_path = values["model_path"]
            model_param_names = [
                "lora_path",
                "lora_base",
                "n_ctx",
                "n_parts",
                "seed",
                "f16_kv",
                "logits_all",
                "vocab_only",
                "use_mlock",
                "n_threads",
                "n_batch",
                "use_mmap",
                "last_n_tokens_size",
            ]
            model_params = {k: values[k] for k in model_param_names}
            # For backwards compatibility, only include if non-null.
            if values["n_gpu_layers"] is not None:
                model_params["n_gpu_layers"] = values["n_gpu_layers"]

            try:
                from llama_cpp import Llama

                values["client"] = Llama(model_path, **model_params)
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import llama-cpp-python library. "
                    "Please install the llama-cpp-python library to "
                    "use this embedding model: pip install llama-cpp-python"
                )
            except Exception as e:
                raise ValueError(
                    f"Could not load Llama model from path: {model_path}. "
                    f"Received error {e}"
                )
        else:
            values["client"] = values["model_path"]
        return values

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        # Roughly 4 chars per token if natural language
        prompt = prompt[-self.n_ctx * 4:]
        verbose = False
        if verbose:
            print("_call prompt: %s" % prompt, flush=True)
        return super()._call(prompt, stop=stop, run_manager=run_manager)
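
For reference, a minimal standalone sketch (not part of this commit) of how the new `H2OLlamaCpp` wrapper might be exercised; it assumes running from the h2oGPT repo root with a local v3-quantized model, and the path, `n_ctx` value, and prompt are illustrative.
```python
# Sketch only: instantiate the wrapper above and run one prompt through
# LangChain's LLM __call__ interface; all values here are example assumptions.
from gpt4all_llm import H2OLlamaCpp

llm = H2OLlamaCpp(model_path="./models/7B/ggml-model-q4_0.bin",
                  n_ctx=2048, verbose=False)
print(llm("Q: What does llama.cpp provide? A:"))
```
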
7 changes: 4 additions & 3 deletions gpt_langchain.py
@@ -19,6 +19,7 @@
from joblib import Parallel, delayed
from tqdm import tqdm

from prompter import non_hf_types
from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
get_device, ProgressParallel

@@ -136,7 +137,7 @@ def get_llm(use_openai_model=False, model_name=None, model=None,
model_name = 'openai'
streamer = None
prompt_type = 'plain'
elif model_name in ['gptj', 'llama']:
elif model_name in non_hf_types:
from gpt4all_llm import get_llm_gpt4all
llm = get_llm_gpt4all(model_name, model=model, max_new_tokens=max_new_tokens,
temperature=temperature,
@@ -916,7 +917,7 @@ def _run_qa_db(query=None,
prompter=prompter,
)

if model_name in ['llama', 'gptj']:
if model_name in non_hf_types:
# FIXME: for now, streams to stdout/stderr currently
stream_output = False

@@ -1002,7 +1003,7 @@ def get_similarity_chain(query=None,
llm=None,
verbose=False,
):
if not use_openai_model and prompt_type not in ['plain'] or model_name in ['llama', 'gptj']:
if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
# instruct-like, rather than few-shot prompt_type='plain' as default
# but then sources confuse the model with how inserted among rest of text, so avoid
prefix = ""
6 changes: 3 additions & 3 deletions gradio_runner.py
@@ -26,7 +26,7 @@ def my_get(url, **kwargs):

from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
from prompter import Prompter, \
prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt
prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt, non_hf_types
from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
ping, get_short_name, get_url, makedirs, get_kwargs
from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa, langchain_modes, \
@@ -1410,7 +1410,7 @@ def count_chat_tokens(model_state1, chat1, prompt_type1):
scheduler = BackgroundScheduler()
scheduler.add_job(func=clear_torch_cache, trigger="interval", seconds=20)
if is_public and \
kwargs['base_model'] not in ['gptj', 'llama']:
kwargs['base_model'] not in non_hf_types:
# FIXME: disable for gptj, langchain or gpt4all modify print itself
# FIXME: and any multi-threaded/async print will enter model output!
scheduler.add_job(func=ping, trigger="interval", seconds=60)
@@ -1419,7 +1419,7 @@ def count_chat_tokens(model_state1, chat1, prompt_type1):
# import control
if kwargs['langchain_mode'] == 'Disabled' and \
os.environ.get("TEST_LANGCHAIN_IMPORT") and \
kwargs['base_model'] not in ['gptj', 'llama']:
kwargs['base_model'] not in non_hf_types:
assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"

3 changes: 3 additions & 0 deletions prompter.py
@@ -2,6 +2,9 @@
from enum import Enum


non_hf_types = ['gpt4all_llama', 'llama', 'gptj']


class PromptType(Enum):
    plain = 0
    instruct = 1
3 changes: 0 additions & 3 deletions requirements.txt
@@ -21,9 +21,6 @@ bitsandbytes==0.39.0
accelerate==0.19.0
git+https://github.com/huggingface/peft.git@3714aa2fff158fdfa637b2b65952580801d890b2
transformers==4.28.1
# below can be used for 4-bit training
# git+https://github.com/huggingface/accelerate.git@0226f750257b3bf2cadc4f189f9eef0c764a0467
# git+https://github.com/huggingface/transformers.git@f67dac97bdc63874f2288546b3fa87e69d2ea1c8
tokenizers==0.13.3
APScheduler==3.10.1

2 changes: 1 addition & 1 deletion requirements_optional_gpt4all.txt
@@ -1,3 +1,3 @@
gpt4all==0.2.3
llama-cpp-python==0.1.54
llama-cpp-python==0.1.55
python-dotenv==1.0.0