add dubbing agent with elevenlabs tools (#59)

* add dubbing agent & elevenlabs tools
video-db · Nov 13, 2024 · ad39c79 · ad39c79
1 parent d9e6b1f
commit ad39c79
Show file tree

Hide file tree

Showing 7 changed files with 266 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,5 @@ venv
 *.egg-info
 package-lock.json
 *.mjs
-site/*
+site/*
+backend/director/downloads
diff --git a/backend/.env.sample b/backend/.env.sample
@@ -18,4 +18,7 @@ BEEP_AUDIO_ID=
 
 # Slack Agent
 SLACK_CHANNEL_NAME=
-SLACK_BOT_TOKEN=
+SLACK_BOT_TOKEN=
+
+# Dubbing Agent
+ELEVENLABS_API_KEY=
diff --git a/backend/director/agents/dubbing.py b/backend/director/agents/dubbing.py
@@ -0,0 +1,172 @@
+import logging
+import os
+
+from director.constants import DOWNLOADS_PATH
+
+from director.agents.base import BaseAgent, AgentResponse, AgentStatus
+from director.core.session import Session, VideoContent, MsgStatus, VideoData
+from director.tools.videodb_tool import VideoDBTool
+from director.tools.elevenlabs import ElevenLabsTool
+
+logger = logging.getLogger(__name__)
+
+# Dubbing backends that DubbingAgent.run accepts for its `engine` argument.
+SUPPORTED_ENGINES = ["elevenlabs"]
+
+# JSON-schema style description of the arguments accepted by DubbingAgent.run.
+# It is assigned to the agent's `parameters` attribute.
+DUBBING_AGENT_PARAMETERS = {
+    "type": "object",
+    "properties": {
+        "video_id": {
+            "type": "string",
+            "description": "The unique identifier of the video that needs to be dubbed. This ID is used to retrieve the video from the VideoDB collection.",
+        },
+        "target_language": {
+            "type": "string",
+            "description": "The target language for dubbing (e.g. 'Spanish', 'French', 'German'). The video's audio will be translated and dubbed into this language.",
+        },
+        "target_language_code": {
+            "type": "string",
+            # Fixed a stray trailing apostrophe after "German" in this description.
+            "description": "The target language code for dubbing (e.g. 'es' for Spanish, 'fr' for French, 'de' for German).",
+        },
+        "collection_id": {
+            "type": "string",
+            "description": "The unique identifier of the VideoDB collection containing the video. Required to locate and access the correct video library.",
+        },
+        "engine": {
+            "type": "string",
+            "description": "The dubbing engine to use. Default is 'elevenlabs'. Possible values include 'elevenlabs'.",
+            "default": "elevenlabs",
+        },
+        "engine_params": {
+            "type": "object",
+            "description": "Optional parameters for the dubbing engine.",
+        },
+    },
+    "required": [
+        "video_id",
+        "target_language",
+        "target_language_code",
+        "collection_id",
+        "engine",
+    ],
+}
+
+
+class DubbingAgent(BaseAgent):
+    """Agent that dubs a VideoDB video into a target language and uploads the result."""
+
+    def __init__(self, session: Session, **kwargs):
+        self.agent_name = "dubbing"
+        self.description = (
+            "This is an agent to dub the given video into a target language"
+        )
+        self.parameters = DUBBING_AGENT_PARAMETERS
+        super().__init__(session=session, **kwargs)
+
+    def run(
+        self,
+        video_id: str,
+        target_language: str,
+        target_language_code: str,
+        collection_id: str,
+        engine: str = "elevenlabs",
+        engine_params: dict = None,
+        *args,
+        **kwargs,
+    ) -> AgentResponse:
+        """
+        Process the video dubbing based on the given video ID.
+
+        The video is downloaded from VideoDB, handed to the dubbing engine,
+        and the dubbed file is uploaded back to VideoDB as a new video.
+
+        :param str video_id: The ID of the video to process.
+        :param str target_language: The target language name for dubbing (e.g. Spanish).
+        :param str target_language_code: The target language code for dubbing (e.g. es).
+        :param str collection_id: The ID of the collection to process.
+        :param str engine: The dubbing engine to use. Default is 'elevenlabs'.
+        :param dict engine_params: Optional parameters for the dubbing engine.
+            Currently accepted but not forwarded to the engine.
+        :param args: Additional positional arguments.
+        :param kwargs: Additional keyword arguments.
+        :return: The response containing information about the dubbing operation.
+            On any failure an ERROR response is returned instead of raising.
+        :rtype: AgentResponse
+        """
+        # Created before the try block so the error handler below can always
+        # report through it, even when validation fails on the first lines.
+        video_content = VideoContent(
+            agent_name=self.agent_name,
+            status=MsgStatus.progress,
+            status_message="Processing...",
+        )
+        self.output_message.content.append(video_content)
+
+        try:
+            self.videodb_tool = VideoDBTool(collection_id=collection_id)
+
+            # Get video audio file
+            video = self.videodb_tool.get_video(video_id)
+            if not video:
+                raise Exception(f"Video {video_id} not found")
+
+            if engine not in SUPPORTED_ENGINES:
+                raise Exception(f"{engine} not supported")
+
+            self.output_message.actions.append("Downloading video")
+            self.output_message.push_update()
+
+            download_response = self.videodb_tool.download(video["stream_url"])
+
+            os.makedirs(DOWNLOADS_PATH, exist_ok=True)
+            dubbed_file_path = f"{DOWNLOADS_PATH}/{video_id}_dubbed.mp4"
+            dubbed_video_name = f"[Dubbed in {target_language}] {video['name']}"
+
+            if engine == "elevenlabs":
+                api_key = os.getenv("ELEVENLABS_API_KEY")
+                if not api_key:
+                    raise Exception("Elevenlabs API key not present in .env")
+                elevenlabs_tool = ElevenLabsTool(api_key=api_key)
+
+                job_id = elevenlabs_tool.create_dub_job(
+                    source_url=download_response["download_url"],
+                    target_language=target_language_code,
+                )
+                # The tool reports failure through its return value rather than
+                # by raising, so anything that is not a job-id string is an error.
+                if not job_id or not isinstance(job_id, str):
+                    raise Exception(f"Failed to create dubbing job: {job_id}")
+                self.output_message.actions.append(
+                    f"Dubbing job initiated with Job ID: {job_id}"
+                )
+                self.output_message.push_update()
+
+                self.output_message.actions.append(
+                    "Waiting for dubbing process to complete.."
+                )
+                self.output_message.push_update()
+                if not elevenlabs_tool.wait_for_dub_job(job_id):
+                    raise Exception(
+                        f"Dubbing job {job_id} failed or did not complete in time"
+                    )
+
+                self.output_message.actions.append("Downloading dubbed video")
+                self.output_message.push_update()
+                downloaded_path = elevenlabs_tool.download_dub_file(
+                    job_id,
+                    target_language_code,
+                    dubbed_file_path,
+                )
+                if not downloaded_path:
+                    raise Exception(
+                        f"Failed to download dubbed file for job {job_id}"
+                    )
+
+            self.output_message.actions.append(
+                f"Uploading dubbed video to VideoDB as '{dubbed_video_name}'"
+            )
+            self.output_message.push_update()
+
+            dubbed_video = self.videodb_tool.upload(
+                dubbed_file_path,
+                source_type="file_path",
+                media_type="video",
+                name=dubbed_video_name,
+            )
+
+            video_content.video = VideoData(stream_url=dubbed_video["stream_url"])
+            video_content.status = MsgStatus.success
+            video_content.status_message = f"Dubbed video in {target_language} has been successfully added to your video. Here is your stream."
+            self.output_message.publish()
+
+            return AgentResponse(
+                status=AgentStatus.SUCCESS,
+                message=f"Successfully dubbed video '{video['name']}' to {target_language}",
+                data={
+                    "stream_url": dubbed_video["stream_url"],
+                    "video_id": dubbed_video["id"],
+                },
+            )
+
+        except Exception as e:
+            video_content.status = MsgStatus.error
+            video_content.status_message = "An error occurred while dubbing the video."
+            self.output_message.publish()
+            logger.exception(f"Error in {self.agent_name} agent: {str(e)}")
+            return AgentResponse(
+                status=AgentStatus.ERROR, message=f"Failed to dub video: {str(e)}"
+            )
diff --git a/backend/director/constants.py b/backend/director/constants.py
@@ -26,3 +26,5 @@ class EnvPrefix(str, Enum):
 
     OPENAI_ = "OPENAI_"
     ANTHROPIC_ = "ANTHROPIC_"
+
+# Relative directory where downloaded/generated media files are written.
+DOWNLOADS_PATH = "director/downloads"
diff --git a/backend/director/handler.py b/backend/director/handler.py
@@ -17,6 +17,7 @@
 from director.agents.stream_video import StreamVideoAgent
 from director.agents.subtitle import SubtitleAgent
 from director.agents.slack_agent import SlackAgent
+from director.agents.dubbing import DubbingAgent
 
 
 from director.core.session import Session, InputMessage, MsgStatus
@@ -48,6 +49,7 @@ def __init__(self, db, **kwargs):
             StreamVideoAgent,
             SubtitleAgent,
             SlackAgent,
+            DubbingAgent,
         ]
 
     def add_videodb_state(self, session):

diff --git a/backend/director/tools/elevenlabs.py b/backend/director/tools/elevenlabs.py
@@ -0,0 +1,83 @@
+import base64
+import json
+import logging
+import os
+import time
+import traceback
+from typing import Optional
+
+from elevenlabs import VoiceSettings
+from elevenlabs.client import ElevenLabs
+
+
+class ElevenLabsTool:
+    """Wrapper around the ElevenLabs client for creating and fetching dubbing jobs.
+
+    The job methods report failure through their return value (``None`` or
+    ``False``) instead of raising, so callers must check what they get back.
+    """
+
+    _logger = logging.getLogger(__name__)
+
+    def __init__(self, api_key: str):
+        """Create the ElevenLabs client.
+
+        :param str api_key: ElevenLabs API key.
+        :raises ValueError: If ``api_key`` is empty; without a client every
+            other method on this tool would fail.
+        """
+        if not api_key:
+            raise ValueError("ElevenLabs API key is required")
+        self.client = ElevenLabs(api_key=api_key)
+        self.voice_settings = VoiceSettings(
+            stability=0.0, similarity_boost=1.0, style=0.0, use_speaker_boost=True
+        )
+
+    def create_dub_job(
+        self,
+        source_url: str,
+        target_language: str,
+    ) -> Optional[str]:
+        """
+        Start a dubbing job for the media at ``source_url``.
+
+        Args:
+            source_url: URL of the audio or video file to dub
+            target_language: Target language code (e.g. "es")
+
+        Returns:
+            The dubbing job ID if the job was created, None if failed
+        """
+        try:
+            response = self.client.dubbing.dub_a_video_or_an_audio_file(
+                source_url=source_url,
+                target_lang=target_language,
+            )
+            return response.dubbing_id
+        except Exception:
+            # Return None (as annotated) rather than an error payload that a
+            # caller could mistake for a job ID.
+            self._logger.exception("Error creating dubbing job")
+            return None
+
+    def wait_for_dub_job(self, dubbing_id: str) -> bool:
+        """Poll the dubbing job until it finishes.
+
+        Returns True once the job status is "dubbed". Returns False if the
+        status is anything other than "dubbing"/"dubbed", if a status check
+        raises, or if the job is still running after all attempts.
+        """
+        MAX_ATTEMPTS = 120
+        CHECK_INTERVAL = 30  # In seconds
+
+        for _ in range(MAX_ATTEMPTS):
+            try:
+                metadata = self.client.dubbing.get_dubbing_project_metadata(dubbing_id)
+                status = metadata.status
+            except Exception:
+                self._logger.exception("Error checking dubbing status")
+                return False
+
+            if status == "dubbed":
+                return True
+            if status != "dubbing":
+                self._logger.error(
+                    "Dubbing job %s ended with status %r", dubbing_id, status
+                )
+                return False
+            time.sleep(CHECK_INTERVAL)
+
+        self._logger.error("Timed out waiting for dubbing job %s", dubbing_id)
+        return False
+
+    def download_dub_file(
+        self, dubbing_id: str, language_code: str, output_path: str
+    ) -> Optional[str]:
+        """Download the dubbed file to ``output_path``.
+
+        Returns ``output_path`` on success and None on failure. On failure any
+        partially written file is removed so that callers cannot pick up an
+        empty or truncated file.
+        """
+        try:
+            with open(output_path, "wb") as file:
+                for chunk in self.client.dubbing.get_dubbed_file(
+                    dubbing_id, language_code
+                ):
+                    file.write(chunk)
+            return output_path
+        except Exception:
+            self._logger.exception("Error downloading dubbed file")
+            # open(..., "wb") creates the file before any data arrives, so a
+            # failed download would otherwise leave an empty/partial file.
+            try:
+                os.remove(output_path)
+            except OSError:
+                pass
+            return None
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -1,5 +1,6 @@
 -e .
 anthropic==0.37.1
+elevenlabs==1.9.0
 Flask==3.0.3
 Flask-SocketIO==5.3.6
 Flask-Cors==4.0.1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -26,3 +26,5 @@ class EnvPrefix(str, Enum):

		OPENAI_ = "OPENAI_"
		ANTHROPIC_ = "ANTHROPIC_"

		DOWNLOADS_PATH="director/downloads"