Merge pull request OpenInterpreter#87 from KillianLucas/rate-limit-fix-and-smart-llama

Rate limit fix, dramatic CodeLlama improvements
KillianLucas authored Sep 7, 2023
2 parents 724f700 + 5729819 commit 7867cc8
Showing 3 changed files with 130 additions and 56 deletions.
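The headline fix wraps the OpenAI call in a small retry loop so a transient rate limit no longer kills the session. Condensed into a standalone sketch for reference (the wrapper name create_chat_completion_with_retry is mine; the openai 0.x calls mirror the ones in the diff below):

import time
import openai

def create_chat_completion_with_retry(model, messages, functions, temperature, retries=3):
    # Retry the streaming ChatCompletion call a few times when the API rate-limits us.
    for _ in range(retries):
        try:
            return openai.ChatCompletion.create(
                model=model,
                messages=messages,
                functions=functions,
                stream=True,
                temperature=temperature,
            )
        except openai.error.RateLimitError:
            time.sleep(3)  # brief back-off, mirroring the 3-second sleep in the diff
    raise openai.error.RateLimitError("RateLimitError: Max retries reached")
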
164 changes: 119 additions & 45 deletions interpreter/interpreter.py
@@ -115,7 +115,7 @@ def get_info_for_system_message(self):
current_working_directory = os.getcwd()
operating_system = platform.system()

info += f"\n\n[User Info]\nName: {username}\nCWD: {current_working_directory}\nOS: {operating_system}"
info += f"[User Info]\nName: {username}\nCWD: {current_working_directory}\nOS: {operating_system}"

if not self.local:

@@ -146,10 +146,8 @@ def get_info_for_system_message(self):
elif self.local:

# Tell Code-Llama how to run code.
info += "\n\nTo run code, simply write a fenced code block (i.e ```python or ```shell) in markdown. When you close it with ```, it will be run. You'll then be given its output."
info += "\n\nTo run code, write a fenced code block (i.e ```python or ```shell) in markdown. When you close it with ```, it will be run. You'll then be given its output."
# We make references in system_message.txt to the "function" it can call, "run_code".
# But functions are not supported by Code-Llama, so:
info = info.replace("run_code", "a markdown code block")

return info

@@ -352,9 +350,19 @@ def respond(self):
# Add relevant info to system_message
# (e.g. current working directory, username, os, etc.)
info = self.get_info_for_system_message()

# This is hacky, as we should have a different (minified) prompt for CodeLLama,
# but for now, to make the prompt shorter and remove "run_code" references, just get the first 2 lines:
if self.local:
self.system_message = "\n".join(self.system_message.split("\n")[:3])
self.system_message += "\nOnly do what the user asks you to do, then ask what they'd like to do next."

system_message = self.system_message + "\n\n" + info

messages = tt.trim(self.messages, self.model, system_message=system_message)
if self.local:
messages = tt.trim(self.messages, max_tokens=1048, system_message=system_message)
else:
messages = tt.trim(self.messages, self.model, system_message=system_message)

if self.debug_mode:
print("\n", "Sending `messages` to LLM:", "\n")
@@ -363,40 +371,92 @@ def respond(self):

# Make LLM call
if not self.local:
# gpt-4
if self.use_azure:
response = openai.ChatCompletion.create(
engine=self.azure_deployment_name,
messages=messages,
functions=[function_schema],
temperature=self.temperature,
stream=True,
)
# GPT

for _ in range(3): # 3 retries
try:

if self.use_azure:
response = openai.ChatCompletion.create(
engine=self.azure_deployment_name,
messages=messages,
functions=[function_schema],
temperature=self.temperature,
stream=True,
)
else:
response = openai.ChatCompletion.create(
model=self.model,
messages=messages,
functions=[function_schema],
stream=True,
temperature=self.temperature,
)

break
except openai.error.RateLimitError:
# Rate limit hit. Retrying in 3 seconds
time.sleep(3)
else:
response = openai.ChatCompletion.create(
model=self.model,
messages=messages,
functions=[function_schema],
stream=True,
temperature=self.temperature,
)
raise openai.error.RateLimitError("RateLimitError: Max retries reached")

elif self.local:
# Code-Llama

# Turn function messages -> system messages for llama compatability
messages = self.messages
for message in messages:
if message['role'] == 'function':
message['role'] = 'system'


response = self.llama_instance.create_chat_completion(
messages=messages,
# Convert messages to prompt
# (This only works if the first message is the only system message)

def messages_to_prompt(messages):
# Extracting the system prompt and initializing the formatted string with it.
system_prompt = messages[0]['content']
formatted_messages = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n"

# Loop starting from the first user message
for index, item in enumerate(messages[1:]):
role = item['role']
content = item['content']

if role == 'user':
formatted_messages += f"{content} [/INST] "
elif role == 'function':
formatted_messages += f"Output: {content} [/INST] "
elif role == 'assistant':
formatted_messages += f"{content} </s><s>[INST] "

# Remove the trailing '<s>[INST] ' from the final output
if formatted_messages.endswith("<s>[INST] "):
formatted_messages = formatted_messages[:-10]

return formatted_messages

prompt = messages_to_prompt(messages)
# Lmao i can't believe this works (it does need this btw)
if messages[-1]["role"] != "function":
prompt += "Let's explore this. By the way, I can run code on your machine by writing the code in a markdown code block. This works for shell, javascript, python, and applescript. I'm going to try to do this for your task. Anyway, "
elif messages[-1]["role"] == "function" and messages[-1]["content"] != "No output":
prompt += "Given the output of the code I just ran, "
elif messages[-1]["role"] == "function" and messages[-1]["content"] == "No output":
prompt += "Given the fact that the code I just ran produced no output, "


if self.debug_mode:
# we have to use builtins bizarrely! because rich.print interprets "[INST]" as something meaningful
import builtins
builtins.print("TEXT PROMPT SEND TO LLM:\n", prompt)

# Run Code-Llama

response = self.llama_instance(
prompt,
stream=True,
temperature=self.temperature,
stop=["</s>"]
)

# Initialize message, function call trackers, and active block
self.messages.append({})
self.messages.append({"role": "assistant"})
in_function_call = False
llama_function_call_finished = False
self.active_block = None
@@ -406,7 +466,13 @@ def respond(self):
# Azure OpenAI Service may return empty chunk
continue

delta = chunk["choices"][0]["delta"]
if self.local:
if "content" not in messages[-1]:
# This is the first chunk. We'll need to capitalize it, because our prompt ends in a ", "
chunk["choices"][0]["text"] = chunk["choices"][0]["text"].capitalize()
delta = {"content": chunk["choices"][0]["text"]}
else:
delta = chunk["choices"][0]["delta"]

# Accumulate deltas into the last message in messages
self.messages[-1] = merge_deltas(self.messages[-1], delta)
@@ -461,21 +527,29 @@ def respond(self):
# Code-Llama
# Parse current code block and save to parsed_arguments, under function_call
if "content" in self.messages[-1]:

# Split by "```" and get the last block
blocks = content.split("```")
if len(blocks) > 1:
current_code_block = blocks[-1]

lines = current_code_block.strip().split("\n")
language = lines[0].strip() if lines[0] else "python"

# Join all lines except for the language line
code = '\n'.join(lines[1:]).strip("` \n")

arguments = {"language": language, "code": code}
print(arguments)


content = self.messages[-1]["content"]

if "```" in content:
# Split by "```" to get the last open code block
blocks = content.split("```")

current_code_block = blocks[-1]

lines = current_code_block.split("\n")

if content.strip() == "```": # Hasn't outputted a language yet
language = None
else:
language = lines[0].strip() if lines[0] != "" else "python"

# Join all lines except for the language line
code = '\n'.join(lines[1:]).strip("` \n")

arguments = {"code": code}
if language: # We only add this if we have it-- the second we have it, an interpreter gets fired up (I think? maybe I'm wrong)
arguments["language"] = language

# Code-Llama won't make a "function_call" property for us to store this under, so:
if "function_call" not in self.messages[-1]:
self.messages[-1]["function_call"] = {}
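
For reference, the messages_to_prompt helper added in this file targets the Llama-2 instruct chat template. A self-contained sketch of the same formatting, with an illustrative two-message conversation (the example messages are mine, not from the commit):

def messages_to_prompt(messages):
    # The first message is assumed to be the only system message.
    system_prompt = messages[0]['content']
    formatted = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n"
    for item in messages[1:]:
        role, content = item['role'], item['content']
        if role == 'user':
            formatted += f"{content} [/INST] "
        elif role == 'function':
            formatted += f"Output: {content} [/INST] "
        elif role == 'assistant':
            formatted += f"{content} </s><s>[INST] "
    # Drop the dangling turn opener left by a trailing assistant message.
    if formatted.endswith("<s>[INST] "):
        formatted = formatted[:-10]
    return formatted

example = [
    {"role": "system", "content": "You are Open Interpreter."},
    {"role": "user", "content": "List the files in the current directory."},
]
print(messages_to_prompt(example))
# <s>[INST] <<SYS>>
# You are Open Interpreter.
# <</SYS>>
# List the files in the current directory. [/INST]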
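
The new streaming parser above also tolerates a fenced block whose language line has not fully arrived yet. The same idea as a standalone function (the name parse_last_code_block is mine):

def parse_last_code_block(content):
    # Extract code (and language, once known) from the last ``` block in streamed markdown.
    if "```" not in content:
        return None

    current_code_block = content.split("```")[-1]
    lines = current_code_block.split("\n")

    if content.strip() == "```":  # fence opened, language not streamed yet
        language = None
    else:
        language = lines[0].strip() if lines[0] != "" else "python"

    arguments = {"code": "\n".join(lines[1:]).strip("` \n")}
    if language:
        arguments["language"] = language
    return arguments

print(parse_last_code_block("Here you go:\n```python\nprint('hi')\n"))
# {'code': "print('hi')", 'language': 'python'}
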
20 changes: 10 additions & 10 deletions interpreter/llama_2.py
@@ -16,19 +16,19 @@ def get_llama_2_instance():

models = {
'7B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf', 'Size': '3.01 GB', 'RAM': '5.51 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q4_K_M.gguf', 'Size': '4.24 GB', 'RAM': '6.74 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q8_0.gguf', 'Size': '7.16 GB', 'RAM': '9.66 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct-instruct.Q3_K_S.gguf', 'Size': '3.01 GB', 'RAM': '5.51 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct-instruct.Q4_K_M.gguf', 'Size': '4.24 GB', 'RAM': '6.74 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-Instruct-7B-GGUF/resolve/main/codellama-7b-instruct.Q8_0.gguf', 'Size': '7.16 GB', 'RAM': '9.66 GB'}
},
'13B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q2_K.gguf', 'Size': '5.66 GB', 'RAM': '8.16 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q4_K_M.gguf', 'Size': '8.06 GB', 'RAM': '10.56 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-GGUF/resolve/main/codellama-13b.Q8_0.gguf', 'Size': '13.83 GB', 'RAM': '16.33 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q3_K_S.gguf', 'Size': '5.66 GB', 'RAM': '8.16 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf', 'Size': '8.06 GB', 'RAM': '10.56 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q8_0.gguf', 'Size': '13.83 GB', 'RAM': '16.33 GB'}
},
'34B': {
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q2_K.gguf', 'Size': '14.21 GB', 'RAM': '16.71 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q4_K_M.gguf', 'Size': '20.22 GB', 'RAM': '22.72 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-GGUF/resolve/main/codellama-34b.Q8_0.gguf', 'Size': '35.79 GB', 'RAM': '38.29 GB'}
'Low': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q3_K_S.gguf', 'Size': '14.21 GB', 'RAM': '16.71 GB'},
'Medium': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q4_K_M.gguf', 'Size': '20.22 GB', 'RAM': '22.72 GB'},
'High': {'URL': 'https://huggingface.co/TheBloke/CodeLlama-34B-Instruct-GGUF/resolve/main/codellama-34b-instruct.Q8_0.gguf', 'Size': '35.79 GB', 'RAM': '38.29 GB'}
}
}

@@ -173,7 +173,7 @@ def supports_metal():
return None

# Initialize and return Code-Llama
llama_2 = Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=False)
llama_2 = Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=False, n_ctx=1048) # n_ctx = context window. smaller is faster

return llama_2

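
Worth noting: the n_ctx=1048 added here is the same budget passed to tt.trim(..., max_tokens=1048) in interpreter.py, so the trimmed conversation should always fit the local model's context window. A rough sketch of that pairing, assuming the llama-cpp-python and tokentrim APIs already used in this diff (the model path, GPU layer count, and messages are illustrative):

import tokentrim as tt
from llama_cpp import Llama

LOCAL_CONTEXT_WINDOW = 1048  # must match n_ctx below and max_tokens in interpreter.py

llama_2 = Llama(
    model_path="codellama-7b-instruct.Q4_K_M.gguf",  # illustrative local path
    n_gpu_layers=1,                                  # illustrative; the repo picks this per platform
    verbose=False,
    n_ctx=LOCAL_CONTEXT_WINDOW,                      # smaller context window = faster
)

system_message = "You are Open Interpreter."
messages = [{"role": "user", "content": "List the files in the current directory."}]

# Trim history to the same budget so the prompt never overflows the model's window.
trimmed = tt.trim(messages, max_tokens=LOCAL_CONTEXT_WINDOW, system_message=system_message)
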
2 changes: 1 addition & 1 deletion interpreter/message_block.py
@@ -30,7 +30,7 @@ def refresh(self, cursor=True):
if cursor:
content += "█"

markdown = Markdown(content)
markdown = Markdown(content.strip())
panel = Panel(markdown, box=MINIMAL)
self.live.update(panel)
self.live.refresh()
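
The message_block.py change simply strips surrounding whitespace before handing the text to Rich, presumably to drop stray leading or trailing blank lines from the model before rendering. A minimal standalone reproduction of that rendering path (not the project's MessageBlock class):

from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.box import MINIMAL

content = "\n\nHello! I'll start by listing the files."  # leading whitespace from the model
content += "█"  # simulated streaming cursor

console = Console()
console.print(Panel(Markdown(content.strip()), box=MINIMAL))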
