Add the transcribe endpoint to the client (#314)

* add TranscribeJob + start_transcription for the api * add function to check whether a link is a YouTube link * add YoutubeSource * add start_transcription method * fix + tests * fix quality
Wordcab · Sep 25, 2023 · b94e727 · b94e727
1 parent 6539c6a
commit b94e727
Show file tree

Hide file tree

Showing 14 changed files with 566 additions and 33 deletions.
diff --git a/src/wordcab/api.py b/src/wordcab/api.py
@@ -18,6 +18,7 @@
 
 from .client import Client
 from .core_objects import (
+    AudioSource,
     BaseSource,
     BaseSummary,
     BaseTranscript,
@@ -28,7 +29,9 @@
     ListTranscripts,
     Stats,
     SummarizeJob,
+    TranscribeJob,
     WordcabTranscriptSource,
+    YoutubeSource,
 )
 
 
@@ -231,6 +234,60 @@ def start_summary(
     )
 
 
+@no_type_check
+def start_transcription(
+    source_object: Union[AudioSource, YoutubeSource],
+    display_name: str,
+    source_lang: str,
+    diarization: bool = False,
+    ephemeral_data: bool = False,
+    only_api: bool = True,
+    tags: Union[str, List[str], None] = None,
+    api_key: Union[str, None] = None,
+) -> TranscribeJob:
+    """
+    Start a transcription job.
+
+    Every argument is forwarded unchanged to ``request`` with
+    ``method="start_transcription"``.
+
+    Parameters
+    ----------
+    source_object : AudioSource or YoutubeSource
+        The source object to transcribe.
+    display_name : str
+        The display name of the transcription. This is useful for retrieving the job later.
+    source_lang : str
+        The language of the source audio.
+    diarization : bool
+        Whether to perform speaker diarization. The default is False.
+    ephemeral_data : bool
+        Whether to delete the data after the transcription is complete. The default is False. If False, the data will be
+        kept on Wordcab's servers. You can delete the data at any time, check the documentation here:
+        https://docs.wordcab.com/docs/enabling-ephemeral-data
+    only_api : bool
+        Whether to only use the API to transcribe the audio. The default is True.
+    tags : str or list of str, optional
+        The tags to add to the job. The default is None. If None, no tags will be added.
+    api_key : str, optional
+        The API key to use. The default is None. If None, the API key will be
+        automatically retrieved from the environment variable WORDCAB_API_KEY.
+
+    Returns
+    -------
+    TranscribeJob
+        The transcribe job object.
+    """
+    return request(
+        method="start_transcription",
+        source_object=source_object,
+        display_name=display_name,
+        source_lang=source_lang,
+        diarization=diarization,
+        ephemeral_data=ephemeral_data,
+        only_api=only_api,
+        tags=tags,
+        api_key=api_key,
+    )
+
+
 @no_type_check
 def list_jobs(
     page_size: int = 100,

diff --git a/src/wordcab/client.py b/src/wordcab/client.py
@@ -30,8 +30,10 @@
     SUMMARY_PIPELINES,
     SUMMARY_TYPES,
     TARGET_LANG,
+    TRANSCRIBE_LANGUAGE_CODES,
 )
 from .core_objects import (
+    AudioSource,
     BaseSource,
     BaseSummary,
     BaseTranscript,
@@ -44,8 +46,10 @@
     Stats,
     StructuredSummary,
     SummarizeJob,
+    TranscribeJob,
     TranscriptUtterance,
     WordcabTranscriptSource,
+    YoutubeSource,
 )
 from .login import get_token
 from .utils import (
@@ -418,6 +422,78 @@ def start_summary(  # noqa: C901
         else:
             raise ValueError(r.text)
 
+    def start_transcription(
+        self,
+        source_object: Union[AudioSource, YoutubeSource],
+        display_name: str,
+        source_lang: str,
+        diarization: bool = False,
+        ephemeral_data: bool = False,
+        only_api: Optional[bool] = True,
+        tags: Union[str, List[str], None] = None,
+        api_key: Union[str, None] = None,
+    ) -> TranscribeJob:
+        """
+        Start a transcription job.
+
+        Parameters
+        ----------
+        source_object : AudioSource or YoutubeSource
+            The source to transcribe. An AudioSource with a ``file_object`` is
+            uploaded as the request body; an AudioSource without one, or a
+            YoutubeSource, is passed by URL in the query parameters.
+        display_name : str
+            The display name of the transcription job.
+        source_lang : str
+            The language of the source. Must be one of TRANSCRIBE_LANGUAGE_CODES.
+        diarization : bool
+            Whether to perform speaker diarization. Sent as ``"true"``/``"false"``.
+        ephemeral_data : bool
+            Whether the data is ephemeral. Sent as ``"true"``/``"false"``.
+        only_api : bool, optional
+            Recorded in the returned job's settings. It is not sent with the request.
+        tags : str or list of str, optional
+            The tags to add to the job. Nothing is sent when empty or None.
+        api_key : str, optional
+            Not read by this method; the request is authenticated with ``self.api_key``.
+
+        Returns
+        -------
+        TranscribeJob
+            The started job, carrying the ``job_name`` returned by the API.
+
+        Raises
+        ------
+        ValueError
+            If ``source_lang`` is not in TRANSCRIBE_LANGUAGE_CODES, or if the API
+            answers with a status code other than 200 or 201 (the message is the
+            response text).
+        """
+        if source_lang not in TRANSCRIBE_LANGUAGE_CODES:
+            # Keep the message on one line so that no source indentation or
+            # newlines end up in the exception text.
+            raise ValueError(
+                f"Invalid source language: {source_lang}. "
+                f"Source language must be one of {TRANSCRIBE_LANGUAGE_CODES}."
+            )
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+        }
+
+        # Booleans are serialized as lowercase strings in the query string.
+        params = {
+            "display_name": display_name,
+            "source_lang": source_lang,
+            "diarization": str(diarization).lower(),
+            "ephemeral_data": str(ephemeral_data).lower(),
+        }
+        if tags:
+            params["tags"] = _format_tags(tags)
+
+        # Request body: only set when an in-memory/file audio object is uploaded.
+        _data = None
+
+        if isinstance(source_object, AudioSource):
+            if source_object.file_object is None:  # URL source
+                params["url_type"] = "audio_url"
+                params["url"] = source_object.url
+            else:  # File object source
+                _data = source_object.file_object
+                headers["Content-Disposition"] = (
+                    f'attachment; filename="{source_object._stem}"'
+                )
+                headers["Content-Type"] = f"audio/{source_object._suffix}"
+        else:  # Youtube source
+            params["url_type"] = source_object.source_type
+            params["url"] = source_object.url
+
+        r = requests.post(
+            "https://wordcab.com/api/v1/transcribe",
+            headers=headers,
+            params=params,
+            data=_data,
+            timeout=self.timeout,
+        )
+
+        # Guard clause: anything other than 200/201 is surfaced to the caller.
+        if r.status_code not in (200, 201):
+            raise ValueError(r.text)
+
+        logger.info("Transcription job started.")
+        return TranscribeJob(
+            display_name=display_name,
+            job_name=r.json()["job_name"],
+            source=source_object.source,
+            source_lang=source_lang,
+            settings=JobSettings(
+                pipeline="transcribe",
+                ephemeral_data=ephemeral_data,
+                only_api=only_api,
+                split_long_utterances=False,
+            ),
+        )
+
     def list_jobs(
         self,
         page_size: Optional[int] = 100,

diff --git a/src/wordcab/config.py b/src/wordcab/config.py
@@ -47,11 +47,11 @@
     "generic": "GenericSource",
     "audio": "AudioSource",
     "wordcab_transcript": "WordcabTranscriptSource",
-    "signed_url": "SignedUrlSource",
     "assembly_ai": "AssemblyAISource",
     "deepgram": "DeepgramSource",
     "rev_ai": "RevSource",
     "vtt": "VTTSource",
+    "youtube": "YoutubeSource",
 }
 SUMMARIZE_AVAILABLE_STATUS = [
     "Deleted",
@@ -69,6 +69,107 @@
 SUMMARY_PIPELINES = ["transcribe", "summarize"]
 SUMMARY_TYPES = ["brief", "conversational", "narrative", "no_speaker"]
 TARGET_LANG = ["de", "en", "es", "fr", "it", "pt", "sv"]
+# Language codes accepted as `source_lang` when starting a transcription job;
+# Client.start_transcription raises ValueError for any value not listed here.
+TRANSCRIBE_LANGUAGE_CODES = [
+    "af",
+    "am",
+    "ar",
+    "as",
+    "az",
+    "ba",
+    "be",
+    "bg",
+    "bn",
+    "bo",
+    "br",
+    "bs",
+    "ca",
+    "cs",
+    "cy",
+    "da",
+    "de",
+    "el",
+    "en",
+    "es",
+    "et",
+    "eu",
+    "fa",
+    "fi",
+    "fo",
+    "fr",
+    "gl",
+    "gu",
+    "ha",
+    "haw",
+    "he",
+    "hi",
+    "hr",
+    "ht",
+    "hu",
+    "hy",
+    "id",
+    "is",
+    "it",
+    "ja",
+    "jw",
+    "ka",
+    "kk",
+    "km",
+    "kn",
+    "ko",
+    "la",
+    "lb",
+    "ln",
+    "lo",
+    "lt",
+    "lv",
+    "mg",
+    "mi",
+    "mk",
+    "ml",
+    "mn",
+    "mr",
+    "ms",
+    "mt",
+    "my",
+    "ne",
+    "nl",
+    "nn",
+    "no",
+    "oc",
+    "pa",
+    "pl",
+    "ps",
+    "pt",
+    "ro",
+    "ru",
+    "sa",
+    "sd",
+    "si",
+    "sk",
+    "sl",
+    "sn",
+    "so",
+    "sq",
+    "sr",
+    "su",
+    "sv",
+    "sw",
+    "ta",
+    "te",
+    "tg",
+    "th",
+    "tk",
+    "tl",
+    "tr",
+    "tt",
+    "uk",
+    "ur",
+    "uz",
+    "vi",
+    "yi",
+    "yo",
+    "zh",
+]
 TRANSCRIPT_SPEAKER_MAPPING = {
     0: "A",
     1: "B",

diff --git a/src/wordcab/core_objects/__init__.py b/src/wordcab/core_objects/__init__.py
@@ -14,7 +14,7 @@
 
 """Wordcab API Core Objects."""
 
-from .job import BaseJob, ExtractJob, JobSettings, ListJobs, SummarizeJob
+from .job import BaseJob, ExtractJob, JobSettings, ListJobs, SummarizeJob, TranscribeJob
 from .source import (
     AssemblyAISource,
     AudioSource,
@@ -23,9 +23,9 @@
     GenericSource,
     InMemorySource,
     RevSource,
-    SignedURLSource,
     VTTSource,
     WordcabTranscriptSource,
+    YoutubeSource,
 )
 from .stats import Stats
 from .summary import BaseSummary, ListSummaries, StructuredSummary
@@ -47,11 +47,12 @@
     "ListSummaries",
     "ListTranscripts",
     "RevSource",
-    "SignedURLSource",
     "Stats",
     "StructuredSummary",
     "SummarizeJob",
+    "TranscribeJob",
     "TranscriptUtterance",
     "VTTSource",
     "WordcabTranscriptSource",
+    "YoutubeSource",
 ]
diff --git a/src/wordcab/core_objects/job.py b/src/wordcab/core_objects/job.py
@@ -107,6 +107,19 @@ def __post_init__(self) -> None:
         self.available_status = SUMMARIZE_AVAILABLE_STATUS
 
 
+@dataclass
+class TranscribeJob(BaseJob):
+    """
+    Wordcab API TranscribeJob object.
+
+    A BaseJob with an additional ``audio_duration`` field.
+    """
+
+    # Defaults to 0. NOTE(review): the unit (seconds?) is not visible here; confirm.
+    audio_duration: int = field(default=0)
+
+    def __post_init__(self) -> None:
+        """Run the BaseJob post-init, then set the job type and the available statuses."""
+        super().__post_init__()
+        self._job_type = "TranscribeJob"
+        # NOTE(review): reuses the summarize status list; confirm that
+        # transcription jobs expose the same set of statuses.
+        self.available_status = SUMMARIZE_AVAILABLE_STATUS
+
+
 @dataclass
 class ListJobs:
     """Wordcab API ListJobs object."""