This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit ae91bf8

Add User input + max tokens requested exceeds model context window error response (#1325)

* added User input + max tokens requested exceeds model context window error response.

Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
XinyuYe-Intel authored Feb 29, 2024
1 parent a7c15a9 commit ae91bf8
Showing 2 changed files with 18 additions and 0 deletions.
2 changes: 2 additions & 0 deletions intel_extension_for_transformers/neural_chat/errorcode.py
@@ -37,6 +37,7 @@ class ErrorCodes:
     ERROR_MODEL_NOT_SUPPORTED = 2006
     ERROR_HF_TOKEN_NOT_PROVIDED = 2007
     WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH = 2101
+    WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH = 2102

     # General Service Error Code - Dataset related
     ERROR_DATASET_NOT_FOUND = 3001
@@ -83,6 +84,7 @@ class ErrorCodes:
         ERROR_INVALID_MODEL_VERSION: "Invalid model version",
         ERROR_MODEL_NOT_SUPPORTED: "Model is not supported",
         WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH: "Input sequence exceeds maximum length",
+        WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH: "Input and completion sequence exceeds maximum length",

         ERROR_DATASET_NOT_FOUND: "Dataset was not found",
         ERROR_DATASET_CONFIG_NOT_FOUND: "Dataset configuration not found",
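The new constant follows the file's existing numbering scheme: 2xxx codes appear to be model-related, with the 21xx range holding input-length warnings (3xxx is dataset-related). A minimal sketch of how a caller might turn the new code into its message; note that `error_message` is a placeholder name for the code-to-string dict edited in the second hunk above, whose real attribute name is not visible in this diff:

```python
# Minimal sketch; `error_message` is an assumed name for the
# code -> string mapping shown in the second hunk of errorcode.py.
class ErrorCodes:
    WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH = 2101             # input alone too long
    WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH = 2102  # input + completion too long

    error_message = {
        WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH:
            "Input sequence exceeds maximum length",
        WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH:
            "Input and completion sequence exceeds maximum length",
    }

print(ErrorCodes.error_message[ErrorCodes.WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH])
# -> Input and completion sequence exceeds maximum length
```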
16 changes: 16 additions & 0 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -1082,6 +1082,14 @@ def predict_stream(**params):
         )
         set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
         return
+    elif length < max_new_tokens:
+        logging.error(f"This model's maximum context length is {context_len} tokens. \
+            However, you requested {input_token_len+max_new_tokens} tokens ({input_token_len} \
+            in the messages, {max_new_tokens} in the completion). Please reduce the length \
+            of the messages or completion.",
+        )
+        set_latest_error(ErrorCodes.WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH)
+        return

     generate_kwargs = get_generate_kwargs(
         max_new_tokens, input_token_len,
@@ -1383,6 +1391,14 @@ def predict(**params):
         )
         set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
         return
+    elif length < max_new_tokens:
+        logging.error(f"This model's maximum context length is {context_len} tokens. \
+            However, you requested {input_token_len+max_new_tokens} tokens ({input_token_len} \
+            in the messages, {max_new_tokens} in the completion). Please reduce the length \
+            of the messages or completion.",
+        )
+        set_latest_error(ErrorCodes.WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH)
+        return

     if device in ["cpu", "cuda", "xpu"]:
         if device in ["cuda", "xpu"]:
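The two hunks are identical: `predict_stream` and `predict` now share the same two-stage length check. A self-contained sketch of that logic, under the assumption that `length` holds the room left in the context window after the prompt; the first branch's condition sits outside this diff, so the `length <= 0` form below is inferred, not quoted:

```python
# Self-contained sketch of the two-stage check the commit adds.
# Names mirror model_utils.py; the first branch's condition is an
# assumption, since only its body appears in the diff context.

def check_context_window(input_token_len: int, max_new_tokens: int, context_len: int):
    """Return None if the request fits, else the warning code (2101/2102)."""
    length = context_len - input_token_len  # room left for the completion
    if length <= 0:
        # Input alone already exceeds the model's context window.
        return 2101  # WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH
    elif length < max_new_tokens:
        # Input fits, but input + requested completion does not.
        return 2102  # WARNING_INPUT_COMPLETION_EXCEED_MAX_SEQ_LENGTH
    return None

# Worked example: a 4096-token context, 4000 prompt tokens, 512 requested
# tokens. 4000 + 512 = 4512 > 4096, so the new 2102 warning fires.
assert check_context_window(4000, 512, 4096) == 2102
assert check_context_window(100, 512, 4096) is None
```

One side note on the committed code: backslash continuations inside an f-string keep each continuation line's leading indentation as part of the string, so the logged message contains runs of embedded spaces; implicit concatenation of adjacent parenthesized string literals would avoid that.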
