feat: Add 'Improve Caption' functionality using Ollama

This commit introduces a new feature that allows users to improve their image captions using the Ollama language model.

- Added an 'Improve Caption' button to the UI, which triggers an asynchronous background task to process the current caption and image using Ollama's llama3.2-vision model.
- Implemented ImproveCaptionTask (QRunnable) to handle the Ollama API interaction and process the improvement request in a separate thread, preventing UI freezes.
- Enhanced the UI with status updates during the improvement process, including disabling/enabling the button and displaying messages.
- Improved error handling to provide more informative messages to the user in case of failures.
- The improved caption is now appended to the text edit with a clear separator ('Improved Version:'), making it easy to compare the original and improved captions.
- The prompt sent to Ollama is more explicit, instructing the model to 'improve or rewrite' the provided caption based on the image context.
- Handles cases where the caption is empty or no image is selected.
healthonrails · Nov 9, 2024 · 9ba1e7a · 9ba1e7a
1 parent fc6d27e
commit 9ba1e7a
Showing 1 changed file with 87 additions and 0 deletions.
diff --git a/annolid/gui/widgets/caption.py b/annolid/gui/widgets/caption.py
@@ -98,11 +98,29 @@ def init_ui(self):
         read_button_layout.addWidget(
             self.read_label, alignment=Qt.AlignCenter)
 
+        # Add the improve caption button
+        self.improve_button = self.create_button(
+            icon_name="draw-arrow-forward",  # Example icon, adjust as needed
+            color="#ccccff",
+            hover_color="#9999ff"
+        )
+        self.improve_label = QLabel("Improve Caption")
+        self.improve_label.setAlignment(Qt.AlignCenter)
+        improve_button_layout = QVBoxLayout()
+        improve_button_layout.addWidget(
+            self.improve_button, alignment=Qt.AlignCenter)
+        improve_button_layout.addWidget(
+            self.improve_label, alignment=Qt.AlignCenter)
+
+        # Connect improve button
+        self.improve_button.clicked.connect(self.improve_caption_async)
+
         # Connect read button to the read_caption_async method
         self.read_button.clicked.connect(self.read_caption_async)
 
         # Add both button layouts to the horizontal layout
         button_layout.addLayout(record_button_layout)
+        button_layout.addLayout(improve_button_layout)
         button_layout.addLayout(describe_button_layout)
         button_layout.addLayout(read_button_layout)
         # (Add read button layout to the main button layout)
@@ -349,6 +367,75 @@ def record_voice(self):
                 """)
                 self.record_label.setText("Tap to record")
 
+    def improve_caption_async(self):
+        """Start a background Ollama job that improves the current caption.
+
+        Reads the caption from ``self.text_edit`` and, when both a non-blank
+        caption and ``self.image_path`` are available, queues an
+        ``ImproveCaptionTask`` on ``self.thread_pool``. The outcome is
+        delivered to ``update_improve_status``.
+        """
+        current_caption = self.text_edit.toPlainText()
+        # A whitespace-only caption carries nothing to improve, so treat it
+        # the same as an empty one instead of sending it to the model.
+        if not current_caption.strip():
+            print("Caption is empty. Nothing to improve.")
+            return
+
+        # Validate before touching the UI so the button is never put into
+        # the busy state for a request that cannot be started.
+        if not self.image_path:
+            self.update_improve_status(
+                "No image selected for caption improvement.", True)
+            return
+
+        self.improve_label.setText("Improving...")
+        self.improve_button.setEnabled(False)
+
+        task = ImproveCaptionTask(self.image_path, current_caption, self)
+        self.thread_pool.start(task)
+
+    @QtCore.Slot(str, bool)
+    def update_improve_status(self, message, is_error):
+        """Show the outcome of a caption-improvement request in the UI.
+
+        Args:
+            message: The improved caption on success, or a description of
+                what went wrong on failure.
+            is_error: True when the request failed.
+        """
+        if is_error:
+            self.improve_label.setText("Improvement failed.")
+            # Keep the label short but expose the reason on hover so the
+            # failure details are not lost.
+            self.improve_label.setToolTip(message)
+        else:
+            # Append rather than replace so the original caption stays
+            # visible for comparison.
+            self.text_edit.append("\n\nImproved Version:\n" + message)
+            self.improve_label.setText("Improve Caption")
+            # Clear any failure reason left over from an earlier attempt.
+            self.improve_label.setToolTip("")
+
+        # Re-enable the button in both outcomes so the user can retry.
+        self.improve_button.setEnabled(True)
+
+
+class ImproveCaptionTask(QRunnable):
+    """Runnable that asks Ollama to improve a caption for a given image.
+
+    The result (or an error description) is handed to the widget's
+    ``update_improve_status`` slot through a queued invocation.
+    """
+
+    def __init__(self, image_path, current_caption, widget):
+        """Remember the image, the caption text and the widget to notify."""
+        super().__init__()
+        self.widget = widget
+        self.image_path = image_path
+        self.current_caption = current_caption
+
+    def _report(self, text, failed):
+        """Queue a call to the widget's ``update_improve_status`` slot."""
+        QMetaObject.invokeMethod(
+            self.widget, "update_improve_status", Qt.QueuedConnection,
+            QtCore.Q_ARG(str, text), QtCore.Q_ARG(bool, failed)
+        )
+
+    def run(self):
+        """Send the caption and image to Ollama and report the outcome."""
+        try:
+            # Imported here, inside the try, so a missing package is
+            # reported like any other failure.
+            import ollama
+
+            request = {
+                'role': 'user',
+                'content': f"Improve or rewrite the following caption, considering the image:\n\n{self.current_caption}",
+                'images': [self.image_path]
+            }
+            reply = ollama.chat(model='llama3.2-vision', messages=[request])
+
+            # Guard against replies that lack the expected message content.
+            if "message" not in reply or "content" not in reply["message"]:
+                raise ValueError("Unexpected response format from Ollama.")
+
+            self._report(reply["message"]["content"], False)
+
+        except Exception as exc:
+            # Any failure above is forwarded to the widget as an error.
+            self._report(f"Error improving caption: {exc}", True)
+
 
 class DescribeImageTask(QRunnable):
     """A task to describe an image in the background."""