Merge pull request OpenInterpreter#87 from KillianLucas/rate-limit-fix-and-smart-llama

Rate limit fix, dramatic CodeLlama improvements
KillianLucas authored Sep 7, 2023
2 parents 724f700 + 5729819 commit 7867cc8
Showing 3 changed files with 130 additions and 56 deletions.
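The headline fix wraps the OpenAI call in a small retry loop so a transient rate limit no longer kills the session. Condensed into a standalone sketch for reference (the wrapper name create_chat_completion_with_retry is mine; the openai 0.x calls mirror the ones in the diff below):

import time
import openai

def create_chat_completion_with_retry(model, messages, functions, temperature, retries=3):
    # Retry the streaming ChatCompletion call a few times when the API rate-limits us.
    for _ in range(retries):
        try:
            return openai.ChatCompletion.create(
                model=model,
                messages=messages,
                functions=functions,
                stream=True,
                temperature=temperature,
            )
        except openai.error.RateLimitError:
            time.sleep(3)  # brief back-off, mirroring the 3-second sleep in the diff
    raise openai.error.RateLimitError("RateLimitError: Max retries reached")
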
164 changes: 119 additions & 45 deletions interpreter/interpreter.py
@@ -115,7 +115,7 @@ def get_info_for_system_message(self):
current_working_directory = os.getcwd()
operating_system = platform.system()

info += f"\n\n[User Info]\nName: {username}\nCWD: {current_working_directory}\nOS: {operating_system}"
info += f"[User Info]\nName: {username}\nCWD: {current_working_directory}\nOS: {operating_system}"

if not self.local:

@@ -146,10 +146,8 @@ def get_info_for_system_message(self):
elif self.local:

# Tell Code-Llama how to run code.
info += "\n\nTo run code, simply write a fenced code block (i.e ```python or ```shell) in markdown. When you close it with ```, it will be run. You'll then be given its output."
info += "\n\nTo run code, write a fenced code block (i.e ```python or ```shell) in markdown. When you close it with ```, it will be run. You'll then be given its output."
# We make references in system_message.txt to the "function" it can call, "run_code".
# But functions are not supported by Code-Llama, so:
info = info.replace("run_code", "a markdown code block")

return info

@@ -352,9 +350,19 @@ def respond(self):
# Add relevant info to system_message
# (e.g. current working directory, username, os, etc.)
info = self.get_info_for_system_message()

# This is hacky, as we should have a different (minified) prompt for CodeLLama,
# but for now, to make the prompt shorter and remove "run_code" references, just get the first 2 lines:
if self.local:
self.system_message = "\n".join(self.system_message.split("\n")[:3])
self.system_message += "\nOnly do what the user asks you to do, then ask what they'd like to do next."

system_message = self.system_message + "\n\n" + info

messages = tt.trim(self.messages, self.model, system_message=system_message)
if self.local:
messages = tt.trim(self.messages, max_tokens=1048, system_message=system_message)
else:
messages = tt.trim(self.messages, self.model, system_message=system_message)

if self.debug_mode:
print("\n", "Sending `messages` to LLM:", "\n")
@@ -363,40 +371,92 @@ def respond(self):

# Make LLM call
if not self.local:
# gpt-4
if self.use_azure:
response = openai.ChatCompletion.create(
engine=self.azure_deployment_name,
messages=messages,
functions=[function_schema],
temperature=self.temperature,
stream=True,
)
# GPT

for _ in range(3): # 3 retries
try:

if self.use_azure:
response = openai.ChatCompletion.create(
engine=self.azure_deployment_name,
messages=messages,
functions=[function_schema],
temperature=self.temperature,
stream=True,
)
else:
response = openai.ChatCompletion.create(
model=self.model,
messages=messages,
functions=[function_schema],
stream=True,
temperature=self.temperature,
)

break
except openai.error.RateLimitError:
# Rate limit hit. Retrying in 3 seconds
time.sleep(3)
else:
response = openai.ChatCompletion.create(
model=self.model,
messages=messages,
functions=[function_schema],
stream=True,
temperature=self.temperature,
)
raise openai.error.RateLimitError("RateLimitError: Max retries reached")

elif self.local:
# Code-Llama

# Turn function messages -> system messages for llama compatability
messages = self.messages
for message in messages:
if message['role'] == 'function':
message['role'] = 'system'


response = self.llama_instance.create_chat_completion(
messages=messages,
# Convert messages to prompt
# (This only works if the first message is the only system message)

def messages_to_prompt(messages):
# Extracting the system prompt and initializing the formatted string with it.
system_prompt = messages[0]['content']
formatted_messages = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n"

# Loop starting from the first user message
for index, item in enumerate(messages[1:]):
role = item['role']
content = item['content']

if role == 'user':
formatted_messages += f"{content} [/INST] "
elif role == 'function':
formatted_messages += f"Output: {content} [/INST] "
elif role == 'assistant':
formatted_messages += f"{content} </s><s>[INST] "

# Remove the trailing '<s>[INST] ' from the final output
if formatted_messages.endswith("<s>[INST] "):
formatted_messages = formatted_messages[:-10]

return formatted_messages

prompt = messages_to_prompt(messages)
# Lmao i can't believe this works (it does need this btw)
if messages[-1]["role"] != "function":
prompt += "Let's explore this. By the way, I can run code on your machine by writing the code in a markdown code block. This works for shell, javascript, python, and applescript. I'm going to try to do this for your task. Anyway, "
elif messages[-1]["role"] == "function" and messages[-1]["content"] != "No output":
prompt += "Given the output of the code I just ran, "
elif messages[-1]["role"] == "function" and messages[-1]["content"] == "No output":
prompt += "Given the fact that the code I just ran produced no output, "


if self.debug_mode:
# we have to use builtins bizarrely! because rich.print interprets "[INST]" as something meaningful
import builtins
builtins.print("TEXT PROMPT SEND TO LLM:\n", prompt)

# Run Code-Llama

response = self.llama_instance(
prompt,
stream=True,
temperature=self.temperature,
stop=["</s>"]
)

# Initialize message, function call trackers, and active block
self.messages.append({})
self.messages.append({"role": "assistant"})
in_function_call = False
llama_function_call_finished = False
self.active_block = None
@@ -406,7 +466,13 @@ def respond(self):
# Azure OpenAI Service may return empty chunk
continue

delta = chunk["choices"][0]["delta"]
if self.local:
if "content" not in messages[-1]:
# This is the first chunk. We'll need to capitalize it, because our prompt ends in a ", "
chunk["choices"][0]["text"] = chunk["choices"][0]["text"].capitalize()
delta = {"content": chunk["choices"][0]["text"]}
else:
delta = chunk["choices"][0]["delta"]

# Accumulate deltas into the last message in messages
self.messages[-1] = merge_deltas(self.messages[-1], delta)
@@ -461,21 +527,29 @@ def respond(self):
# Code-Llama
# Parse current code block and save to parsed_arguments, under function_call
if "content" in self.messages[-1]:

# Split by "```" and get the last block
blocks = content.split("```")
if len(blocks) > 1:
current_code_block = blocks[-1]

lines = current_code_block.strip().split("\n")
language = lines[0].strip() if lines[0] else "python"

# Join all lines except for the language line
code = '\n'.join(lines[1:]).strip("` \n")

arguments = {"language": language, "code": code}
print(arguments)


content = self.messages[-1]["content"]

if "```" in content:
# Split by "```" to get the last open code block
blocks = content.split("```")

current_code_block = blocks[-1]

lines = current_code_block.split("\n")

if content.strip() == "```": # Hasn't outputted a language yet
language = None
else:
language = lines[0].strip() if lines[0] != "" else "python"

# Join all lines except for the language line
code = '\n'.join(lines[1:]).strip("` \n")

arguments = {"code": code}
if language: # We only add this if we have it-- the second we have it, an interpreter gets fired up (I think? maybe I'm wrong)
arguments["language"] = language

# Code-Llama won't make a "function_call" property for us to store this under, so:
if "function_call" not in self.messages[-1]:
self.messages[-1]["function_call"] = {}
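
For reference, the messages_to_prompt helper added in this file targets the Llama-2 instruct chat template. A self-contained sketch of the same formatting, with an illustrative two-message conversation (the example messages are mine, not from the commit):

def messages_to_prompt(messages):
    # The first message is assumed to be the only system message.
    system_prompt = messages[0]['content']
    formatted = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n"
    for item in messages[1:]:
        role, content = item['role'], item['content']
        if role == 'user':
            formatted += f"{content} [/INST] "
        elif role == 'function':
            formatted += f"Output: {content} [/INST] "
        elif role == 'assistant':
            formatted += f"{content} </s><s>[INST] "
    # Drop the dangling turn opener left by a trailing assistant message.
    if formatted.endswith("<s>[INST] "):
        formatted = formatted[:-10]
    return formatted

example = [
    {"role": "system", "content": "You are Open Interpreter."},
    {"role": "user", "content": "List the files in the current directory."},
]
print(messages_to_prompt(example))
# <s>[INST] <<SYS>>
# You are Open Interpreter.
# <</SYS>>
# List the files in the current directory. [/INST]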
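
The new streaming parser above also tolerates a fenced block whose language line has not fully arrived yet. The same idea as a standalone function (the name parse_last_code_block is mine):

def parse_last_code_block(content):
    # Extract code (and language, once known) from the last ``` block in streamed markdown.
    if "```" not in content:
        return None

    current_code_block = content.split("```")[-1]
    lines = current_code_block.split("\n")

    if content.strip() == "```":  # fence opened, language not streamed yet
        language = None
    else:
        language = lines[0].strip() if lines[0] != "" else "python"

    arguments = {"code": "\n".join(lines[1:]).strip("` \n")}
    if language:
        arguments["language"] = language
    return arguments

print(parse_last_code_block("Here you go:\n```python\nprint('hi')\n"))
# {'code': "print('hi')", 'language': 'python'}
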
20 changes: 10 additions & 10 deletions interpreter/llama_2.py
@@ -16,19 +16,19 @@ def get_llama_2_instance():

models = {
'7B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf', 'Size': '3.01 GB', 'RAM': '5.51 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q4_K_M.gguf', 'Size': '4.24 GB', 'RAM': '6.74 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q8_0.gguf', 'Size': '7.16 GB', 'RAM': '9.66 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct-instruct.Q3_K_S.gguf', 'Size': '3.01 GB', 'RAM': '5.51 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct-instruct.Q4_K_M.gguf', 'Size': '4.24 GB', 'RAM': '6.74 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-Instruct-7B-GGUF/resolve/main/codellama-7b-instruct.Q8_0.gguf', 'Size': '7.16 GB', 'RAM': '9.66 GB'}
},
'13B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q2_K.gguf', 'Size': '5.66 GB', 'RAM': '8.16 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q4_K_M.gguf', 'Size': '8.06 GB', 'RAM': '10.56 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q8_0.gguf', 'Size': '13.83 GB', 'RAM': '16.33 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q3_K_S.gguf', 'Size': '5.66 GB', 'RAM': '8.16 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf', 'Size': '8.06 GB', 'RAM': '10.56 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q8_0.gguf', 'Size': '13.83 GB', 'RAM': '16.33 GB'}
},
'34B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q2_K.gguf', 'Size': '14.21 GB', 'RAM': '16.71 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q4_K_M.gguf', 'Size': '20.22 GB', 'RAM': '22.72 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q8_0.gguf', 'Size': '35.79 GB', 'RAM': '38.29 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q3_K_S.gguf', 'Size': '14.21 GB', 'RAM': '16.71 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q4_K_M.gguf', 'Size': '20.22 GB', 'RAM': '22.72 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q8_0.gguf', 'Size': '35.79 GB', 'RAM': '38.29 GB'}
}
}

@@ -173,7 +173,7 @@ def supports_metal():
return None

# Initialize and return Code-Llama
llama_2 = Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=False)
llama_2 = Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=False, n_ctx=1048) # n_ctx = context window. smaller is faster

return llama_2

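
Worth noting: the n_ctx=1048 added here is the same budget passed to tt.trim(..., max_tokens=1048) in interpreter.py, so the trimmed conversation should always fit the local model's context window. A rough sketch of that pairing, assuming the llama-cpp-python and tokentrim APIs already used in this diff (the model path, GPU layer count, and messages are illustrative):

import tokentrim as tt
from llama_cpp import Llama

LOCAL_CONTEXT_WINDOW = 1048  # must match n_ctx below and max_tokens in interpreter.py

llama_2 = Llama(
    model_path="codellama-7b-instruct.Q4_K_M.gguf",  # illustrative local path
    n_gpu_layers=1,                                  # illustrative; the repo picks this per platform
    verbose=False,
    n_ctx=LOCAL_CONTEXT_WINDOW,                      # smaller context window = faster
)

system_message = "You are Open Interpreter."
messages = [{"role": "user", "content": "List the files in the current directory."}]

# Trim history to the same budget so the prompt never overflows the model's window.
trimmed = tt.trim(messages, max_tokens=LOCAL_CONTEXT_WINDOW, system_message=system_message)
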
2 changes: 1 addition & 1 deletion interpreter/message_block.py
@@ -30,7 +30,7 @@ def refresh(self, cursor=True):
if cursor:
content += "█"

markdown = Markdown(content)
markdown = Markdown(content.strip())
panel = Panel(markdown, box=MINIMAL)
self.live.update(panel)
self.live.refresh()
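
The message_block.py change simply strips surrounding whitespace before handing the text to Rich, presumably to drop stray leading or trailing blank lines from the model before rendering. A minimal standalone reproduction of that rendering path (not the project's MessageBlock class):

from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.box import MINIMAL

content = "\n\nHello! I'll start by listing the files."  # leading whitespace from the model
content += "█"  # simulated streaming cursor

console = Console()
console.print(Panel(Markdown(content.strip()), box=MINIMAL))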
