Merge pull request #212 from h2oai/allow_add_to_db_from_generate
Allow add to db when loading from generate
pseudotensor authored Jun 3, 2023
2 parents 3434c74 + fbce29e commit b3135a4
Showing 9 changed files with 258 additions and 72 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -174,7 +174,7 @@ model_name_gpt4all_llama=ggml-wizardLM-7B.q4_2.bin
```
For `gptj` and `gpt4all_llama`, you can choose a different model than our default choice by going to the GPT4All Model explorer [GPT4All-J compatible model](https://gpt4all.io/index.html). One does not need to download manually; the gpt4all package will download the model at runtime and put it into `.cache`, like huggingface would. However, the `gptj` model often gives [no output](docs/FAQ.md#gpt4all-not-producing-output), even outside h2oGPT.
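
For example, a hypothetical `.env_gpt4all` override choosing a different GGML file (the key is the one shown in the default above; the file name is purely illustrative — pick one from the model explorer):
```
model_name_gpt4all_llama=ggml-model-q4_0.bin
```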

So, for chatting, a better instruct fine-tuned LLaMa-based model for llama.cpp can be downloaded from [TheBloke](https://huggingface.co/TheBloke). For example, [13B WizardLM Quantized](https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML) or [7B WizardLM Quantized](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML). TheBloke has a variety of model types, quantization bit, and memory consumption. Choose what is best for your system's specs. However, be aware that LLaMa-based models are not [commercially viable](docs/FAQ.md#commercial-viability).
So, for chatting, a better instruct fine-tuned LLaMa-based model for llama.cpp can be downloaded from [TheBloke](https://huggingface.co/TheBloke). For example, [13B WizardLM Quantized](https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML) or [7B WizardLM Quantized](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML). TheBloke has a variety of model types, quantization bit depths, and memory consumption. Choose what is best for your system's specs. However, be aware that LLaMa-based models are not [commercially viable](docs/FAQ.md#commercial-viability).

For the 7B case, download [WizardLM-7B-uncensored.ggmlv3.q8_0.bin](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML/blob/main/WizardLM-7B-uncensored.ggmlv3.q8_0.bin) into a local path. Then set `model_path_llama` in `.env_gpt4all`, which is currently the default.
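
A minimal `.env_gpt4all` entry for this, assuming the file was downloaded into the repository directory (adjust the path if you saved it elsewhere):
```
model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin
```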

5 changes: 4 additions & 1 deletion cli.py
@@ -1,7 +1,7 @@
import copy
import torch

from generate import eval_func_param_names, get_score_model, get_model, evaluate
from generate import eval_func_param_names, get_score_model, get_model, evaluate, check_locals
from prompter import non_hf_types
from utils import clear_torch_cache, NullContext, get_kwargs

@@ -22,11 +22,14 @@ def run_cli( # for local function:
# for evaluate kwargs
src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
model_state0=None, raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None, chunk=None, chunk_size=None,
db_type=None, n_jobs=None, first_para=None, text_limit=None, verbose=None, cli=None,
# unique to this function:
cli_loop=None,
):
check_locals(**locals())

score_model = "" # FIXME: For now, so user doesn't have to pass
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
device = 'cpu' if n_gpus == 0 else 'cuda'
8 changes: 8 additions & 0 deletions docs/README_LangChain.md
@@ -77,6 +77,8 @@ pip install -r reqs_optional/requirements_optional_langchain.gpllike.txt
```
but pymupdf is AGPL, which requires that any source code using it be made available. This is not directly an issue for h2oGPT, but, like GPL, it is too strong a constraint for general commercial use.

When pymupdf is installed, we use `PyMuPDFLoader` by default to parse PDFs, since it is better than `PyPDFLoader` and much better than `PDFMinerLoader`. This can be overridden by setting `PDF_CLASS_NAME=PyPDFLoader` in `.env_gpt4all`.
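
For example, a minimal `.env_gpt4all` override (comment added for illustration):
```
# parse PDFs with pypdf even when pymupdf is installed
PDF_CLASS_NAME=PyPDFLoader
```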

## Database creation

To use some example databases (this will overwrite the UserData db made above unless you change options) and then run generate, do:
@@ -98,6 +100,12 @@ python make_db.py --add_if_exists=True
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
```

By default, `generate.py` will load an existing UserData database, add any new documents found in `user_path`, and update any files that have changed. To avoid detecting any new files, simply do not pass `--user_path=user_path`, which leaves it as None, i.e.:
```bash
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
```
which avoids using `user_path` since it is not passed. Otherwise, when it is passed, any new files are added and any changed files (detected by hash) are updated (old sources are deleted and new sources added).
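
Conversely, a sketch of the opposite case, passing `--user_path` so new or changed files are picked up at load time (same model and paths as above):
```bash
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path
```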

## Document Question-Answer FAQ

#### Why does the source link not work?
27 changes: 3 additions & 24 deletions eval.py
@@ -7,7 +7,7 @@
from matplotlib import pyplot as plt

from generate import eval_func_param_names, eval_extra_columns, get_context, get_score_model, get_model, evaluate, \
inputs_kwargs_list
inputs_kwargs_list, check_locals
from prompter import Prompter
from utils import clear_torch_cache, NullContext, get_kwargs

@@ -39,32 +39,11 @@ def run_eval( # for local function:
# for evaluate kwargs:
src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
model_state0=None, raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None, chunk=None, chunk_size=None,
db_type=None, n_jobs=None, first_para=None, text_limit=None, verbose=None, cli=None,
):
# ensure everything in evaluate is here
can_skip_because_locally_generated = [ # evaluate
'instruction',
'iinput',
'context',
'instruction_nochat',
'iinput_nochat',
# get_model:
'reward_type'
]
for k in eval_func_param_names:
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k
for k in inputs_kwargs_list:
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k

for k in list(inspect.signature(get_model).parameters):
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k
check_locals(**locals())

if eval_prompts_only_num > 0:
np.random.seed(eval_prompts_only_seed)
44 changes: 39 additions & 5 deletions generate.py
@@ -93,8 +93,8 @@ def main(
stream_output: bool = True,
show_examples: bool = None,
verbose: bool = False,
h2ocolors: bool = True,
height: int = 400,
h2ocolors: bool = False,
height: int = 600,
show_lora: bool = True,
login_mode_if_model0: bool = False,
block_gradio_exit: bool = True,
@@ -122,6 +122,7 @@ def main(
visible_langchain_modes: list = ['UserData', 'MyData'],
document_choice: list = ['All'],
user_path: str = None,
detect_user_path_changes_every_query: bool = False,
load_db_if_exists: bool = True,
keep_sources_in_context: bool = False,
db_type: str = 'chroma',
@@ -214,7 +215,10 @@ def main(
:param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
:param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires a really good workstation to generate the db, unless it is already present.
:param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode
:param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
If a db already exists, any new or changed files under this path are added automatically when the path is set; it does not have to be the same path used for prior db sources.
:param detect_user_path_changes_every_query: whether to detect (by file hashes) if any files were changed or added on every similarity search.
Expensive for a large number of files, so not done by default; by default, changes are only detected during db loading (an example invocation follows this hunk).
:param visible_langchain_modes: dbs to generate at launch to be ready for LLM
Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
But wiki_full is expensive and requires preparation
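
A hypothetical invocation illustrating the two parameters documented above, assuming CLI flags mirror these `main()` arguments as in the other documented examples:
```bash
# update the UserData db from user_path once at load time (the default when user_path is passed)
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path

# additionally re-check user_path for new/changed files (by hash) on every similarity search; expensive for many files
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path \
    --detect_user_path_changes_every_query=True
```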
@@ -231,7 +235,7 @@ def main(
:param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
:param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
:param enable_url_upload: Whether to allow upload from URL
:param enable_text_upload: Whether to allow uplaod of text
:param enable_text_upload: Whether to allow upload of text
:param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
:param chunk: Whether to chunk data (True unless know data is already optimally chunked)
:param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to fit in context length
@@ -379,7 +383,9 @@ def main(
# FIXME: All should be avoided until scans over each db, shouldn't be separate db
continue
persist_directory1 = 'db_dir_%s' % langchain_mode1 # single place, no special names for each case
db = prep_langchain(persist_directory1, load_db_if_exists, db_type, use_openai_embedding,
db = prep_langchain(persist_directory1,
load_db_if_exists,
db_type, use_openai_embedding,
langchain_mode1, user_path,
hf_embedding_model,
kwargs_make_db=locals())
@@ -843,6 +849,7 @@ def evaluate(
load_db_if_exists=True,
dbs=None,
user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None,
use_openai_model=None,
hf_embedding_model=None,
@@ -938,6 +945,7 @@ def evaluate(
load_db_if_exists=load_db_if_exists,
db=db1,
user_path=user_path,
detect_user_path_changes_every_query=detect_user_path_changes_every_query,
max_new_tokens=max_new_tokens,
cut_distanct=1.1 if langchain_mode in ['wiki_full'] else 1.64, # FIXME, too arbitrary
use_openai_embedding=use_openai_embedding,
@@ -1473,6 +1481,32 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
return score


def check_locals(**kwargs):
# ensure everything in evaluate is here
can_skip_because_locally_generated = [ # evaluate
'instruction',
'iinput',
'context',
'instruction_nochat',
'iinput_nochat',
# get_model:
'reward_type'
]
for k in eval_func_param_names:
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k
for k in inputs_kwargs_list:
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k

for k in list(inspect.signature(get_model).parameters):
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k


if __name__ == "__main__":
"""
Examples: