From 26d377c6a8ba973f986379430b46bf5cfa000071 Mon Sep 17 00:00:00 2001
From: kddubey
Date: Tue, 7 Nov 2023 12:55:27 -0800
Subject: [PATCH] Little fixes

---
 src/cappr/_example.py                      | 21 -------------------
 src/cappr/huggingface/classify.py          |  9 ++++----
 src/cappr/huggingface/classify_no_cache.py | 22 ++++++++++++++++----
 src/cappr/llama_cpp/_utils.py              |  5 ++---
 src/cappr/llama_cpp/classify.py            | 24 +++++++++++++++++-----
 src/cappr/openai/api.py                    |  6 +++---
 src/cappr/utils/_check.py                  |  5 +++--
 src/cappr/utils/_no_cache.py               |  6 +++---
 src/cappr/utils/classify.py                | 11 +++++-----
 9 files changed, 58 insertions(+), 51 deletions(-)

diff --git a/src/cappr/_example.py b/src/cappr/_example.py
index 51f0960..2b2fc57 100644
--- a/src/cappr/_example.py
+++ b/src/cappr/_example.py
@@ -32,27 +32,6 @@ class Example:
         probability distribution over completions. Set this to `False` if you'd like
         the raw completion-after-prompt probability, or you're solving a multi-label
         prediction problem. By default, True
-
-    Raises
-    ------
-    TypeError
-        if `prompt` is not a string
-    ValueError
-        if `prompt` is empty
-    TypeError
-        if `completions` is not a sequence
-    ValueError
-        if `completions` is empty, or contains an empty string
-    TypeError
-        if `end_of_prompt` is not a string
-    ValueError
-        if `end_of_prompt` is not a whitespace or empty
-    TypeError
-        if `prior` is not None, or it isn't a sequence or numpy array
-    ValueError
-        if `prior` is not a 1-D probability distribution over `completions`
-    ValueError
-        if `normalize` is True but there's only one completion in `completions`
     """
 
     prompt: str
diff --git a/src/cappr/huggingface/classify.py b/src/cappr/huggingface/classify.py
index b1ff42f..0a18ad9 100644
--- a/src/cappr/huggingface/classify.py
+++ b/src/cappr/huggingface/classify.py
@@ -483,7 +483,7 @@ def cache(
 
 
 ########################################################################################
-############################## Logits (from cached model) ##############################
+############################### Logits from cached model ###############################
 ########################################################################################
 
 
@@ -534,7 +534,7 @@ def _blessed_helper(
     )
     num_batches = len(completions_input_ids)
 
-    # TODO: put this in the context manager? Little weird.
+    # TODO: put this in the context manager? Little weird
     if not hf._utils.does_tokenizer_need_prepended_space(tokenizer):
         start_of_prompt = ""
     else:
@@ -890,9 +890,8 @@ def log_probs_conditional_examples(
     print(log_probs_completions[1])  # corresponds to examples[1]
     # [[-5.0, -1.7]] [[log Pr(1 | a, b, c)], log Pr(2 | a, b, c, 1)]]
     """
-    # Little weird. I want my IDE to know that examples is always a Sequence[Example]
-    # b/c of the decorator.
-    examples: Sequence[Example] = examples
+    # examples is always a Sequence[Example] b/c of the decorator.
+    examples = cast(Sequence[Example], examples)
 
     @_batch.flatten
     @_batch.batchify(
diff --git a/src/cappr/huggingface/classify_no_cache.py b/src/cappr/huggingface/classify_no_cache.py
index b301aa1..afd710e 100644
--- a/src/cappr/huggingface/classify_no_cache.py
+++ b/src/cappr/huggingface/classify_no_cache.py
@@ -11,7 +11,7 @@
 this module **does not** precompute attention block keys and values for prompts.
""" from __future__ import annotations -from typing import Literal, Mapping, Sequence +from typing import cast, Literal, Mapping, Sequence import numpy as np import numpy.typing as npt @@ -88,6 +88,11 @@ def token_logprobs( return hf.classify.token_logprobs(**locals()) +######################################################################################## +######################################## Logits ######################################## +######################################################################################## + + def _prompts_offsets( tokenizer: PreTrainedTokenizerBase, prompts: Sequence[str], @@ -163,6 +168,11 @@ def _logits_completions_given_prompts_examples( return logits, encodings +######################################################################################## +################################## Logits to log-probs ################################# +######################################################################################## + + def _logits_to_log_probs_completions( logits: torch.Tensor, encodings: Mapping[str, torch.Tensor] ) -> list[list[float]]: @@ -179,6 +189,11 @@ def _logits_to_log_probs_completions( ] +######################################################################################## +##################################### Implementation ################################### +######################################################################################## + + @classify._log_probs_conditional def log_probs_conditional( prompts: str | Sequence[str], @@ -377,9 +392,8 @@ def log_probs_conditional_examples( print(log_probs_completions[1]) # corresponds to examples[1] # [[-5.0, -1.7]] [[log Pr(1 | a, b, c)], log Pr(2 | a, b, c, 1)]] """ - # Little weird. I want my IDE to know that examples is always a Sequence[Example] - # b/c of the decorator. - examples: Sequence[Example] = examples + # examples is always a Sequence[Example] b/c of the decorator. + examples = cast(Sequence[Example], examples) @_batch.flatten @_batch.batchify( diff --git a/src/cappr/llama_cpp/_utils.py b/src/cappr/llama_cpp/_utils.py index c5df504..9e8f0ec 100644 --- a/src/cappr/llama_cpp/_utils.py +++ b/src/cappr/llama_cpp/_utils.py @@ -29,9 +29,8 @@ def check_logits(logits) -> np.ndarray: logits = np.array(logits) if np.any(np.isnan(logits)): raise TypeError( - "There are nan logits. This can happen if the model is re-loaded too many " - "times in the same session. Please raise this as an issue so that I can " - "investigate: https://github.com/kddubey/cappr/issues" + "There are nan logits. Is there something wrong with the model? This can " + "happen if the model is reloaded many times in the same session." 
         )  # pragma: no cover
     return logits
 
diff --git a/src/cappr/llama_cpp/classify.py b/src/cappr/llama_cpp/classify.py
index 32127ae..6cfed46 100644
--- a/src/cappr/llama_cpp/classify.py
+++ b/src/cappr/llama_cpp/classify.py
@@ -23,7 +23,7 @@
 """
 from __future__ import annotations
 from contextlib import contextmanager
-from typing import Literal, Sequence
+from typing import cast, Literal, Sequence
 
 from llama_cpp import Llama
 import numpy as np
@@ -119,6 +119,11 @@ def token_logprobs(
     return log_probs
 
 
+########################################################################################
+###################################### KV caching ######################################
+########################################################################################
+
+
 @contextmanager
 def cache(model: Llama, prefix: str, reset_model: bool = True):
     """
@@ -191,6 +196,11 @@ def cache(model: Llama, prefix: str, reset_model: bool = True):
         model.n_tokens = n_tokens
 
 
+########################################################################################
+############################## Logprobs from cached model ##############################
+########################################################################################
+
+
 def _log_probs_conditional_prompt(
     prompt: str,
     completions: Sequence[str],
@@ -199,7 +209,7 @@ def _log_probs_conditional_prompt(
 ) -> list[list[float]]:
     _utils.check_model(model)
     # Prepend whitespaces if the tokenizer or context call for it
-    # TODO: put this in the context manager? Little weird.
+    # TODO: put this in the context manager? Little weird
     if not _utils.does_tokenizer_need_prepended_space(model):
         start_of_prompt = ""
         end_of_prompt = ""
@@ -258,6 +268,11 @@ def _log_probs_conditional_prompt(
     return log_probs_completions
 
 
+########################################################################################
+#################################### Implementation ####################################
+########################################################################################
+
+
 @classify._log_probs_conditional
 def log_probs_conditional(
     prompts: str | Sequence[str],
@@ -441,9 +456,8 @@ def log_probs_conditional_examples(
     print(log_probs_completions[1])  # corresponds to examples[1]
     # [[-9.90, -10.0]] [[log Pr(d | a, b, c)], log Pr(e | a, b, c, d)]]
     """
-    # Little weird. I want my IDE to know that examples is always a Sequence[Example]
-    # b/c of the decorator.
-    examples: Sequence[Example] = examples
+    # examples is always a Sequence[Example] b/c of the decorator.
+    examples = cast(Sequence[Example], examples)
     if reset_model:
         model.reset()
     log_probs_completions = [
diff --git a/src/cappr/openai/api.py b/src/cappr/openai/api.py
index 69c07a6..591835e 100644
--- a/src/cappr/openai/api.py
+++ b/src/cappr/openai/api.py
@@ -15,7 +15,7 @@
 try:
     from openai import OpenAI
 except ImportError:  # pragma: no cover
-    # openai version < 1.0.0. Many breaking changes need handling
+    # openai < v1.0.0. Many breaking changes need handling
     OpenAI = type("OpenAI", (object,), {})  # pragma: no cover
     _ERRORS_MODULE = openai.error  # pragma: no cover
 else:
@@ -304,7 +304,7 @@ def gpt_complete(
         list with the same length as `texts`. Each element is the ``choices`` mapping
     """
     _check.ordered(texts, variable_name="texts")
-    try:
+    try:  # openai < v1.0.0
         openai_method = openai.Completion.create  # pragma: no cover
     except AttributeError:
         openai_method = (
@@ -400,7 +400,7 @@ def gpt_chat_complete(
         list with the same length as `texts`. Each element is the ``choices`` mapping
     """
     _check.ordered(texts, variable_name="texts")
-    try:
+    try:  # openai < v1.0.0
         openai_method = openai.ChatCompletion.create  # pragma: no cover
     except AttributeError:
         openai_method = (
diff --git a/src/cappr/utils/_check.py b/src/cappr/utils/_check.py
index dbe064e..e4003ac 100644
--- a/src/cappr/utils/_check.py
+++ b/src/cappr/utils/_check.py
@@ -170,8 +170,9 @@ def remove_bos(tokens: list[int]) -> list[int]:
         tokens_concat_correct = tokenize("a") + remove_bos(tokenize(" b"))
         if tokens != tokens_concat_correct:
             raise ValueError(
-                "This tokenizer is weird. Please raise this as an issue so that I can "
-                "investigate: https://github.com/kddubey/cappr/issues"
+                "This tokenizer is weird. Perhaps it's adding EOS tokens? Please raise "
+                "this as an issue so that I can investigate: "
+                "https://github.com/kddubey/cappr/issues"
             )  # pragma: no cover
         return True
     return False
diff --git a/src/cappr/utils/_no_cache.py b/src/cappr/utils/_no_cache.py
index bd83da9..6c5f5db 100644
--- a/src/cappr/utils/_no_cache.py
+++ b/src/cappr/utils/_no_cache.py
@@ -2,7 +2,7 @@
 Utilities for implementations which don't cache.
 """
 from __future__ import annotations
-from typing import Callable, Literal, Sequence
+from typing import Callable, cast, Literal, Sequence
 
 from cappr.utils import _batch
 
@@ -72,8 +72,8 @@ def log_probs_conditional_examples(
 ):
     from cappr import Example
 
-    # Little weird. I want my IDE to know that examples is always a Sequence[Example]
-    examples: Sequence[Example] = examples
+    # examples is always a Sequence[Example] b/c of the decorator.
+    examples = cast(Sequence[Example], examples)
 
     texts = [
         example.prompt + example.end_of_prompt + completion
diff --git a/src/cappr/utils/classify.py b/src/cappr/utils/classify.py
index e3e0ccc..3e50a52 100644
--- a/src/cappr/utils/classify.py
+++ b/src/cappr/utils/classify.py
@@ -501,7 +501,7 @@ def _predict(predict_proba_func):
     @wraps(predict_proba_func)
     def wrapper(
         prompts: str | Sequence[str], completions: Sequence[str], *args, **kwargs
-    ) -> list[str]:
+    ) -> str | list[str]:
         if len(completions) == 1:
             raise ValueError(
                 "completions only has one completion. predict will trivially return "
@@ -511,9 +511,8 @@ def wrapper(
         pred_probs: npt.NDArray = predict_proba_func(
             prompts, completions, *args, **kwargs
         )
-        if not isinstance(completions, Sequence):
-            # We need completions to support 0-indexed __getitem__
-            completions = list(completions)
+        # We need completions to support 0-indexed __getitem__
+        completions = list(completions)
         num_dimensions = pred_probs.ndim
         if isinstance(prompts, str):
             # User convenience: prompts was a single string, so pred_probs is 1-D
@@ -535,7 +534,9 @@ def _predict_examples(predict_proba_examples_func):
     from cappr import Example
 
     @wraps(predict_proba_examples_func)
-    def wrapper(examples: Example | Sequence[Example], *args, **kwargs) -> list[str]:
+    def wrapper(
+        examples: Example | Sequence[Example], *args, **kwargs
+    ) -> str | list[str]:
         pred_probs: npt.NDArray[np.floating] | list[
             npt.NDArray[np.floating]
         ] = predict_proba_examples_func(examples, *args, **kwargs)
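
Note on the recurring typing change: `examples: Sequence[Example] = examples` is a bare re-annotation of a parameter, which type checkers may reject as an incompatible assignment, whereas `typing.cast` performs the same narrowing explicitly and returns its argument unchanged at runtime. Below is a minimal, self-contained sketch of the pattern; `normalize_examples` and `count_completions` are hypothetical stand-ins for illustration, not cappr's actual decorator or API:

    from __future__ import annotations

    from dataclasses import dataclass
    from functools import wraps
    from typing import Sequence, cast


    @dataclass
    class Example:
        prompt: str
        completions: Sequence[str]


    def normalize_examples(func):
        # Hypothetical stand-in: wrap a lone Example in a list so that the
        # wrapped function always receives a sequence.
        @wraps(func)
        def wrapper(examples, *args, **kwargs):
            if isinstance(examples, Example):
                examples = [examples]
            return func(examples, *args, **kwargs)

        return wrapper


    @normalize_examples
    def count_completions(examples: Example | Sequence[Example]) -> list[int]:
        # The signature still admits a single Example, but the decorator
        # guarantees a sequence here. cast() records that fact for the type
        # checker and does nothing at runtime.
        examples = cast(Sequence[Example], examples)
        return [len(example.completions) for example in examples]


    print(count_completions(Example("a", ["b", "c"])))  # [2]

The same reasoning applies to the `str | list[str]` return annotations: `predict` returns a single string when `prompts` is a single string, so the wider annotation simply documents what the wrappers already do.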