Skip to content

Commit

Permalink
Add the transcribe endpoint to the client (#314)
Browse files Browse the repository at this point in the history
* add TranscribeJob + start_transcription for the api

* add fct to check if youtube link

* add YoutubeSource

* add start_transcription method

* fix + tests

* fix quality
  • Loading branch information
Thomas Chaigneau authored Sep 25, 2023
1 parent 6539c6a commit b94e727
Show file tree
Hide file tree
Showing 14 changed files with 566 additions and 33 deletions.
57 changes: 57 additions & 0 deletions src/wordcab/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from .client import Client
from .core_objects import (
AudioSource,
BaseSource,
BaseSummary,
BaseTranscript,
Expand All @@ -28,7 +29,9 @@
ListTranscripts,
Stats,
SummarizeJob,
TranscribeJob,
WordcabTranscriptSource,
YoutubeSource,
)


Expand Down Expand Up @@ -231,6 +234,60 @@ def start_summary(
)


@no_type_check
def start_transcription(
    source_object: Union[AudioSource, YoutubeSource],
    display_name: str,
    source_lang: str,
    diarization: bool = False,
    ephemeral_data: bool = False,
    only_api: bool = True,
    tags: Union[str, List[str], None] = None,
    api_key: Union[str, None] = None,
) -> TranscribeJob:
    """
    Launch a transcription job for an audio or YouTube source.

    Every argument is forwarded unchanged to ``request`` under the
    ``"start_transcription"`` method name.

    Parameters
    ----------
    source_object : AudioSource or YoutubeSource
        The source object to transcribe.
    display_name : str
        The display name of the transcription. This is useful for retrieving
        the job later.
    source_lang : str
        The language of the source audio.
    diarization : bool
        Whether to perform speaker diarization. The default is False.
    ephemeral_data : bool
        Whether to delete the data after the transcription is complete. The
        default is False. If False, the data will be kept on Wordcab's
        servers. You can delete the data at any time, check the documentation
        here: https://docs.wordcab.com/docs/enabling-ephemeral-data
    only_api : bool
        Whether to only use the API to transcribe the audio. The default is
        True.
    tags : str or list of str, optional
        The tags to add to the job. The default is None. If None, no tags
        will be added.
    api_key : str, optional
        The API key to use. The default is None. If None, the API key will be
        automatically retrieved from the environment variable
        WORDCAB_API_KEY.

    Returns
    -------
    TranscribeJob
        The transcribe job object.
    """
    # Gather the job options once so the dispatch below stays a single call.
    job_options = {
        "source_object": source_object,
        "display_name": display_name,
        "source_lang": source_lang,
        "diarization": diarization,
        "ephemeral_data": ephemeral_data,
        "only_api": only_api,
        "tags": tags,
        "api_key": api_key,
    }
    return request(method="start_transcription", **job_options)


@no_type_check
def list_jobs(
page_size: int = 100,
Expand Down
76 changes: 76 additions & 0 deletions src/wordcab/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
SUMMARY_PIPELINES,
SUMMARY_TYPES,
TARGET_LANG,
TRANSCRIBE_LANGUAGE_CODES,
)
from .core_objects import (
AudioSource,
BaseSource,
BaseSummary,
BaseTranscript,
Expand All @@ -44,8 +46,10 @@
Stats,
StructuredSummary,
SummarizeJob,
TranscribeJob,
TranscriptUtterance,
WordcabTranscriptSource,
YoutubeSource,
)
from .login import get_token
from .utils import (
Expand Down Expand Up @@ -418,6 +422,78 @@ def start_summary( # noqa: C901
else:
raise ValueError(r.text)

def start_transcription(
    self,
    source_object: Union[AudioSource, YoutubeSource],
    display_name: str,
    source_lang: str,
    diarization: bool = False,
    ephemeral_data: bool = False,
    only_api: Optional[bool] = True,
    tags: Union[str, List[str], None] = None,
    api_key: Union[str, None] = None,
) -> TranscribeJob:
    """
    Start a transcription job.

    Sends a POST request to ``https://wordcab.com/api/v1/transcribe`` and
    wraps the returned job name in a ``TranscribeJob``.

    Parameters
    ----------
    source_object : AudioSource or YoutubeSource
        The source to transcribe. An ``AudioSource`` holding a file object is
        uploaded as the request body; an ``AudioSource`` without one is sent
        by URL. Any other object is handled as a URL source using its
        ``source_type`` and ``url`` attributes.
    display_name : str
        The display name of the transcription job.
    source_lang : str
        The language of the source audio. Must be one of
        ``TRANSCRIBE_LANGUAGE_CODES``.
    diarization : bool
        Sent to the API as a lowercase string. The default is False.
    ephemeral_data : bool
        Sent to the API as a lowercase string and recorded in the returned
        job settings. The default is False.
    only_api : bool, optional
        Recorded in the returned job settings only; it is not sent as a
        request parameter. The default is True.
    tags : str or list of str, optional
        Tags to attach to the job. Omitted from the request when falsy.
    api_key : str, optional
        Not read by this method; the request is authenticated with
        ``self.api_key``.

    Returns
    -------
    TranscribeJob
        The transcribe job object built from the API response.

    Raises
    ------
    ValueError
        If ``source_lang`` is not a supported language code, or if the API
        answers with a status code other than 200 or 201 (the response text
        is used as the message).
    """
    if source_lang not in TRANSCRIBE_LANGUAGE_CODES:
        # Built from adjacent literals so the message carries no stray
        # newlines or source indentation.
        raise ValueError(
            f"Invalid source language: {source_lang}. "
            f"Source language must be one of {TRANSCRIBE_LANGUAGE_CODES}."
        )

    headers = {"Authorization": f"Bearer {self.api_key}"}
    params = {
        "display_name": display_name,
        "source_lang": source_lang,
        # Booleans are serialized as "true"/"false" query values.
        "diarization": str(diarization).lower(),
        "ephemeral_data": str(ephemeral_data).lower(),
    }
    if tags:
        params["tags"] = _format_tags(tags)

    if isinstance(source_object, AudioSource):
        _data = source_object.file_object
        if _data is None:  # URL source
            params["url_type"] = "audio_url"
            params["url"] = source_object.url
        else:  # File object source: describe the uploaded payload
            headers["Content-Disposition"] = (
                f'attachment; filename="{source_object._stem}"'
            )
            headers["Content-Type"] = f"audio/{source_object._suffix}"
    else:  # Youtube source
        params["url_type"] = source_object.source_type
        params["url"] = source_object.url
        _data = None

    r = requests.post(
        "https://wordcab.com/api/v1/transcribe",
        headers=headers,
        params=params,
        data=_data,
        timeout=self.timeout,
    )

    if r.status_code not in (200, 201):
        raise ValueError(r.text)

    logger.info("Transcription job started.")
    return TranscribeJob(
        display_name=display_name,
        job_name=r.json()["job_name"],
        source=source_object.source,
        source_lang=source_lang,
        settings=JobSettings(
            pipeline="transcribe",
            ephemeral_data=ephemeral_data,
            only_api=only_api,
            split_long_utterances=False,
        ),
    )

def list_jobs(
self,
page_size: Optional[int] = 100,
Expand Down
103 changes: 102 additions & 1 deletion src/wordcab/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@
"generic": "GenericSource",
"audio": "AudioSource",
"wordcab_transcript": "WordcabTranscriptSource",
"signed_url": "SignedUrlSource",
"assembly_ai": "AssemblyAISource",
"deepgram": "DeepgramSource",
"rev_ai": "RevSource",
"vtt": "VTTSource",
"youtube": "YoutubeSource",
}
SUMMARIZE_AVAILABLE_STATUS = [
"Deleted",
Expand All @@ -69,6 +69,107 @@
# Option lists imported by the client module.
SUMMARY_PIPELINES = ["transcribe", "summarize"]
SUMMARY_TYPES = ["brief", "conversational", "narrative", "no_speaker"]
TARGET_LANG = ["de", "en", "es", "fr", "it", "pt", "sv"]

# Language codes accepted as ``source_lang`` when starting a transcription;
# the client raises ``ValueError`` for any value not listed here.
# Kept in alphabetical order, one row per initial letter.
TRANSCRIBE_LANGUAGE_CODES = [
    "af", "am", "ar", "as", "az",
    "ba", "be", "bg", "bn", "bo", "br", "bs",
    "ca", "cs", "cy",
    "da", "de",
    "el", "en", "es", "et", "eu",
    "fa", "fi", "fo", "fr",
    "gl", "gu",
    "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy",
    "id", "is", "it",
    "ja", "jw",
    "ka", "kk", "km", "kn", "ko",
    "la", "lb", "ln", "lo", "lt", "lv",
    "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my",
    "ne", "nl", "nn", "no",
    "oc",
    "pa", "pl", "ps", "pt",
    "ro", "ru",
    "sa", "sd", "si", "sk", "sl", "sn", "so", "sq", "sr", "su", "sv", "sw",
    "ta", "te", "tg", "th", "tk", "tl", "tr", "tt",
    "uk", "ur", "uz",
    "vi",
    "yi", "yo",
    "zh",
]
TRANSCRIPT_SPEAKER_MAPPING = {
0: "A",
1: "B",
Expand Down
7 changes: 4 additions & 3 deletions src/wordcab/core_objects/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

"""Wordcab API Core Objects."""

from .job import BaseJob, ExtractJob, JobSettings, ListJobs, SummarizeJob
from .job import BaseJob, ExtractJob, JobSettings, ListJobs, SummarizeJob, TranscribeJob
from .source import (
AssemblyAISource,
AudioSource,
Expand All @@ -23,9 +23,9 @@
GenericSource,
InMemorySource,
RevSource,
SignedURLSource,
VTTSource,
WordcabTranscriptSource,
YoutubeSource,
)
from .stats import Stats
from .summary import BaseSummary, ListSummaries, StructuredSummary
Expand All @@ -47,11 +47,12 @@
"ListSummaries",
"ListTranscripts",
"RevSource",
"SignedURLSource",
"Stats",
"StructuredSummary",
"SummarizeJob",
"TranscribeJob",
"TranscriptUtterance",
"VTTSource",
"WordcabTranscriptSource",
"YoutubeSource",
]
13 changes: 13 additions & 0 deletions src/wordcab/core_objects/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ def __post_init__(self) -> None:
self.available_status = SUMMARIZE_AVAILABLE_STATUS


@dataclass
class TranscribeJob(BaseJob):
    """Wordcab API job object describing a transcription job."""

    # Defaults to 0 when not supplied; the unit is not established in this
    # module -- TODO confirm against the API response.
    audio_duration: int = 0

    def __post_init__(self) -> None:
        """Run the base post-init, then mark the instance as a transcribe job."""
        super().__post_init__()
        self._job_type = "TranscribeJob"
        # NOTE(review): the status list is shared with summarize jobs --
        # confirm transcription jobs expose the same set of statuses.
        self.available_status = SUMMARIZE_AVAILABLE_STATUS


@dataclass
class ListJobs:
"""Wordcab API ListJobs object."""
Expand Down
Loading

0 comments on commit b94e727

Please sign in to comment.