Merge pull request #212 from h2oai/allow_add_to_db_from_generate
Allow add to db when loading from generate
pseudotensor authored Jun 3, 2023
2 parents 3434c74 + fbce29e commit b3135a4
Showing 9 changed files with 258 additions and 72 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -174,7 +174,7 @@ model_name_gpt4all_llama=ggml-wizardLM-7B.q4_2.bin
```
For `gptj` and `gpt4all_llama`, you can choose a different model than our default choice by going to the GPT4All Model explorer [GPT4All-J compatible model](https://gpt4all.io/index.html). One does not need to download manually; the gpt4all package will download the model at runtime and put it into `.cache`, like huggingface would. However, the `gptj` model often gives [no output](docs/FAQ.md#gpt4all-not-producing-output), even outside h2oGPT.
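
For example, a hypothetical `.env_gpt4all` override choosing a different GGML file (the key is the one shown in the default above; the file name is purely illustrative — pick one from the model explorer):
```
model_name_gpt4all_llama=ggml-model-q4_0.bin
```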

So, for chatting, a better instruct fine-tuned LLaMa-based model for llama.cpp can be downloaded from [TheBloke](https://huggingface.co/TheBloke). For example, [13B WizardLM Quantized](https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML) or [7B WizardLM Quantized](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML). TheBloke has a variety of model types, quantization bit, and memory consumption. Choose what is best for your system's specs. However, be aware that LLaMa-based models are not [commercially viable](docs/FAQ.md#commercial-viability).
So, for chatting, a better instruct fine-tuned LLaMa-based model for llama.cpp can be downloaded from [TheBloke](https://huggingface.co/TheBloke). For example, [13B WizardLM Quantized](https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML) or [7B WizardLM Quantized](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML). TheBloke has a variety of model types, quantization bit depths, and memory consumption. Choose what is best for your system's specs. However, be aware that LLaMa-based models are not [commercially viable](docs/FAQ.md#commercial-viability).

For the 7B case, download [WizardLM-7B-uncensored.ggmlv3.q8_0.bin](https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML/blob/main/WizardLM-7B-uncensored.ggmlv3.q8_0.bin) into a local path. Then set `model_path_llama` in `.env_gpt4all`, which is currently the default.
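
A minimal `.env_gpt4all` entry for this, assuming the file was downloaded into the repository directory (adjust the path if you saved it elsewhere):
```
model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin
```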

5 changes: 4 additions & 1 deletion cli.py
@@ -1,7 +1,7 @@
import copy
import torch

from generate import eval_func_param_names, get_score_model, get_model, evaluate
from generate import eval_func_param_names, get_score_model, get_model, evaluate, check_locals
from prompter import non_hf_types
from utils import clear_torch_cache, NullContext, get_kwargs

@@ -22,11 +22,14 @@ def run_cli( # for local function:
# for evaluate kwargs
src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
model_state0=None, raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None, chunk=None, chunk_size=None,
db_type=None, n_jobs=None, first_para=None, text_limit=None, verbose=None, cli=None,
# unique to this function:
cli_loop=None,
):
check_locals(**locals())

score_model = "" # FIXME: For now, so user doesn't have to pass
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
device = 'cpu' if n_gpus == 0 else 'cuda'
8 changes: 8 additions & 0 deletions docs/README_LangChain.md
@@ -77,6 +77,8 @@ pip install -r reqs_optional/requirements_optional_langchain.gpllike.txt
```
but pymupdf is AGPL, which requires that any source code using it be made available. This is not directly an issue for h2oGPT, but, like GPL, it is too strong a constraint for general commercial use.

When pymupdf is installed, we use `PyMuPDFLoader` by default to parse PDFs, since it is better than `PyPDFLoader` and much better than `PDFMinerLoader`. This can be overridden by setting `PDF_CLASS_NAME=PyPDFLoader` in `.env_gpt4all`.
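
For example, a minimal `.env_gpt4all` override (comment added for illustration):
```
# parse PDFs with pypdf even when pymupdf is installed
PDF_CLASS_NAME=PyPDFLoader
```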

## Database creation

To use some example databases (this will overwrite the UserData db made above unless you change options) and then run generate, do:
@@ -98,6 +100,12 @@ python make_db.py --add_if_exists=True
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
```

By default, `generate.py` will load an existing UserData database, add any new documents found in `user_path`, and update any files that have changed. To avoid detecting any new files, simply do not pass `--user_path=user_path`, which leaves it as None, i.e.:
```bash
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
```
which avoids using `user_path` since it is not passed. Otherwise, when it is passed, any new files are added and any changed files (detected by hash) are updated (old sources are deleted and new sources added).
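
Conversely, a sketch of the opposite case, passing `--user_path` so new or changed files are picked up at load time (same model and paths as above):
```bash
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path
```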

## Document Question-Answer FAQ

#### Why does the source link not work?
27 changes: 3 additions & 24 deletions eval.py
@@ -7,7 +7,7 @@
from matplotlib import pyplot as plt

from generate import eval_func_param_names, eval_extra_columns, get_context, get_score_model, get_model, evaluate, \
inputs_kwargs_list
inputs_kwargs_list, check_locals
from prompter import Prompter
from utils import clear_torch_cache, NullContext, get_kwargs

@@ -39,32 +39,11 @@ def run_eval( # for local function:
# for evaluate kwargs:
src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
model_state0=None, raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None, chunk=None, chunk_size=None,
db_type=None, n_jobs=None, first_para=None, text_limit=None, verbose=None, cli=None,
):
# ensure everything in evaluate is here
can_skip_because_locally_generated = [ # evaluate
'instruction',
'iinput',
'context',
'instruction_nochat',
'iinput_nochat',
# get_model:
'reward_type'
]
for k in eval_func_param_names:
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k
for k in inputs_kwargs_list:
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k

for k in list(inspect.signature(get_model).parameters):
if k in can_skip_because_locally_generated:
continue
assert k in locals(), "Missing %s" % k
check_locals(**locals())

if eval_prompts_only_num > 0:
np.random.seed(eval_prompts_only_seed)
44 changes: 39 additions & 5 deletions generate.py
@@ -93,8 +93,8 @@ def main(
stream_output: bool = True,
show_examples: bool = None,
verbose: bool = False,
h2ocolors: bool = True,
height: int = 400,
h2ocolors: bool = False,
height: int = 600,
show_lora: bool = True,
login_mode_if_model0: bool = False,
block_gradio_exit: bool = True,
@@ -122,6 +122,7 @@ def main(
visible_langchain_modes: list = ['UserData', 'MyData'],
document_choice: list = ['All'],
user_path: str = None,
detect_user_path_changes_every_query: bool = False,
load_db_if_exists: bool = True,
keep_sources_in_context: bool = False,
db_type: str = 'chroma',
@@ -214,7 +215,10 @@ def main(
:param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
:param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires a really good workstation to generate the db, unless it is already present.
:param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode
:param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
If a db already exists, any new or changed files under this path are added automatically when the path is set; it does not have to be the same path used for prior db sources.
:param detect_user_path_changes_every_query: whether to detect (by file hashes) if any files were changed or added on every similarity search.
Expensive for a large number of files, so not done by default; by default, changes are only detected during db loading (an example invocation follows this hunk).
:param visible_langchain_modes: dbs to generate at launch to be ready for LLM
Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
But wiki_full is expensive and requires preparation
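
A hypothetical invocation illustrating the two parameters documented above, assuming CLI flags mirror these `main()` arguments as in the other documented examples:
```bash
# update the UserData db from user_path once at load time (the default when user_path is passed)
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path

# additionally re-check user_path for new/changed files (by hash) on every similarity search; expensive for many files
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData --user_path=user_path \
    --detect_user_path_changes_every_query=True
```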
@@ -231,7 +235,7 @@ def main(
:param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
:param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
:param enable_url_upload: Whether to allow upload from URL
:param enable_text_upload: Whether to allow uplaod of text
:param enable_text_upload: Whether to allow upload of text
:param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
:param chunk: Whether to chunk data (True unless know data is already optimally chunked)
:param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to fit in context length
@@ -379,7 +383,9 @@ def main(
# FIXME: All should be avoided until scans over each db, shouldn't be separate db
continue
persist_directory1 = 'db_dir_%s' % langchain_mode1 # single place, no special names for each case
db = prep_langchain(persist_directory1, load_db_if_exists, db_type, use_openai_embedding,
db = prep_langchain(persist_directory1,
load_db_if_exists,
db_type, use_openai_embedding,
langchain_mode1, user_path,
hf_embedding_model,
kwargs_make_db=locals())
@@ -843,6 +849,7 @@ def evaluate(
load_db_if_exists=True,
dbs=None,
user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None,
use_openai_model=None,
hf_embedding_model=None,
@@ -938,6 +945,7 @@ def evaluate(
load_db_if_exists=load_db_if_exists,
db=db1,
user_path=user_path,
detect_user_path_changes_every_query=detect_user_path_changes_every_query,
max_new_tokens=max_new_tokens,
cut_distanct=1.1 if langchain_mode in ['wiki_full'] else 1.64, # FIXME, too arbitrary
use_openai_embedding=use_openai_embedding,
@@ -1473,6 +1481,32 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
return score


def check_locals(**kwargs):
# ensure everything in evaluate is here
can_skip_because_locally_generated = [ # evaluate
'instruction',
'iinput',
'context',
'instruction_nochat',
'iinput_nochat',
# get_model:
'reward_type'
]
for k in eval_func_param_names:
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k
for k in inputs_kwargs_list:
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k

for k in list(inspect.signature(get_model).parameters):
if k in can_skip_because_locally_generated:
continue
assert k in kwargs, "Missing %s" % k


if __name__ == "__main__":
"""
Examples: