Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update for whisper #488

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ClassTranscribeDatabase/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.201"
"version": "8.0"
}
}
2 changes: 1 addition & 1 deletion ClassTranscribeServer/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.401"
"version": "8.0"
}
}
1 change: 1 addition & 0 deletions PythonRpcServer/randomvoice_16kHz.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"text": " Hello? Hello? Hello?", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 3.0, "text": " Hello? Hello? Hello?", "tokens": [50363, 18435, 30, 18435, 30, 18435, 30, 50513], "temperature": 0.0, "avg_logprob": -0.636968559688992, "compression_ratio": 1.1764705882352942, "no_speech_prob": 0.22877301275730133}], "language": "en"}
Binary file added PythonRpcServer/randomvoice_16kHz.wav
Binary file not shown.
12 changes: 12 additions & 0 deletions PythonRpcServer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ def LogWorker(logId, worker):


class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
# TODO: transcribe the audio here and return the transcription as a JSON string.
# TODO: rename this RPC to TranscribeRPC once it does so.
def CaptionRPC(self, request, context):
    """Handle a CaptionRequest and reply with a ct_pb2.JsonString.

    The captions are currently obtained from KalturaProvider.getCaptions for
    request.refId; the call is routed through LogWorker, labelled with the
    request's file path.
    """
    # See CaptionRequest for the fields echoed below.
    print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
    provider = KalturaProvider()

    def fetch_captions():
        # Deferred so that LogWorker decides when the lookup actually runs.
        return provider.getCaptions(request.refId)

    captions = LogWorker(f"CaptionRPC({request.filePath})", fetch_captions)
    return ct_pb2.JsonString(json = captions)



def GetScenesRPC(self, request, context):
raise NotImplementedError('Implementation now in pyapi')
# res = scenedetector.find_scenes(request.filePath)
Expand Down
57 changes: 57 additions & 0 deletions PythonRpcServer/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import subprocess
import os
import json
import re

def transcribe_audio_with_whisper(audio_file_path):
    """Transcribe an audio file by running the ``whisper`` command-line tool.

    Runs ``whisper <audio_file_path> --model base.en --output_format json`` and
    parses the timestamped cue lines that the tool prints to stdout, e.g.
    ``[00:00.000 --> 00:03.000]  Hello?``.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        A dict of the form
        ``{"en": [{"starttime": str, "endtime": str, "caption": str}, ...]}``
        with one entry per cue line found in the output, or ``None`` if the
        whisper process fails or any other error occurs (the error is printed).

    Raises:
        FileNotFoundError: If ``audio_file_path`` does not exist.
    """
    if not os.path.exists(audio_file_path):
        raise FileNotFoundError(f"Audio file {audio_file_path} does not exist.")

    command = [
        "whisper",
        audio_file_path,
        "--model", "base.en",
        "--output_format", "json"
    ]

    # One cue per output line. The optional leading "HH:" group accepts the
    # longer timestamp form used once a recording passes the one-hour mark.
    timestamp = r'(?:\d+:)?\d+:\d+\.\d+'
    cue_pattern = re.compile(r'\[(' + timestamp + r')\s+-->\s+(' + timestamp + r')\]\s+(.*)')

    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)

        print("Whisper Output:")
        print(result.stdout)

        formatted_data = {"en": []}

        # Cues are separated by single newlines, so match line by line; splitting
        # on blank lines would collapse the whole output into one chunk and keep
        # only its first cue.
        for line in result.stdout.splitlines():
            match = cue_pattern.search(line)
            if match is None:
                continue  # progress/diagnostic output, not a cue

            formatted_data["en"].append({
                "starttime": match.group(1),
                "endtime": match.group(2),
                "caption": match.group(3).strip()
            })

        return formatted_data

    except subprocess.CalledProcessError as e:
        # check=True raises when whisper exits with a non-zero status.
        print(f"Error during transcription: {e.stderr}")
        return None

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

if __name__ == "__main__":
    # Manual smoke test: transcribe the bundled sample recording and print the
    # captions as indented JSON, or report that transcription failed.
    sample_path = "randomvoice_16kHz.wav"
    captions = transcribe_audio_with_whisper(sample_path)

    if not captions:
        print("Transcription failed.")
    else:
        print(json.dumps(captions, indent=4))
4 changes: 4 additions & 0 deletions TaskEngine.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
FROM mcr.microsoft.com/dotnet/sdk:8.0-bookworm-slim as build
# See https://mcr.microsoft.com/en-us/product/dotnet/sdk/tags
#See more comments in API.Dockerfile
# RUN ls
RUN dotnet --list-sdks

WORKDIR /
RUN git clone https://github.com/eficode/wait-for.git

WORKDIR /src
COPY ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj
# --verbosity normal|diagnostic


RUN dotnet restore --verbosity diagnostic ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj

COPY ./TaskEngine/TaskEngine.csproj ./TaskEngine/TaskEngine.csproj
Expand Down
5 changes: 3 additions & 2 deletions TaskEngine/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ public static void SetupServices()
.AddSingleton<DownloadPlaylistInfoTask>()
.AddSingleton<DownloadMediaTask>()
.AddSingleton<ConvertVideoToWavTask>()
.AddSingleton<TranscriptionTask>()
.AddSingleton<LocalTranscriptionTask>()
.AddSingleton<AzureTranscriptionTask>()
.AddSingleton<QueueAwakerTask>()
// .AddSingleton<GenerateVTTFileTask>()
.AddSingleton<RpcClient>()
Expand Down Expand Up @@ -175,7 +176,7 @@ static void createTaskQueues() {
// Transcription Related
_logger.LogInformation($"Creating TranscriptionTask consumers. Concurrency={concurrent_transcriptions} ");

_serviceProvider.GetService<TranscriptionTask>().Consume(concurrent_transcriptions);
_serviceProvider.GetService<LocalTranscriptionTask>().Consume(concurrent_transcriptions);

// no more! - _serviceProvider.GetService<GenerateVTTFileTask>().Consume(concurrent_transcriptions);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ namespace TaskEngine.Tasks
/// This task produces the transcriptions for a Video item.
/// </summary>
[SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated
class TranscriptionTask : RabbitMQTask<string>
class AzureTranscriptionTask : RabbitMQTask<string>
{

private readonly MSTranscriptionService _msTranscriptionService;
// nope private readonly GenerateVTTFileTask _generateVTTFileTask;
private readonly CaptionQueries _captionQueries;


public TranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService,
public AzureTranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService,
// GenerateVTTFileTask generateVTTFileTask,
ILogger<TranscriptionTask> logger, CaptionQueries captionQueries)
ILogger<AzureTranscriptionTask> logger, CaptionQueries captionQueries)
: base(rabbitMQ, TaskType.TranscribeVideo, logger)
{
_msTranscriptionService = msTranscriptionService;
Expand Down
9 changes: 4 additions & 5 deletions TaskEngine/Tasks/ConvertVideoToWavTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ namespace TaskEngine.Tasks
class ConvertVideoToWavTask : RabbitMQTask<string>
{
private readonly RpcClient _rpcClient;
private readonly TranscriptionTask _transcriptionTask;
private readonly LocalTranscriptionTask _localTranscriptionTask;

public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, TranscriptionTask transcriptionTask, ILogger<ConvertVideoToWavTask> logger)
public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, LocalTranscriptionTask localTranscriptionTask, ILogger<ConvertVideoToWavTask> logger)
: base(rabbitMQ, TaskType.ConvertMedia, logger)
{
_rpcClient = rpcClient;
_transcriptionTask = transcriptionTask;
_localTranscriptionTask = localTranscriptionTask;
}

protected override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup)
Expand Down Expand Up @@ -72,11 +72,10 @@ private async Task OldOnConsumeNotUsed(string videoId)
videoLatest.Audio = fileRecord;
await _context.SaveChangesAsync();


// If no transcriptions present, produce transcriptions.
if (!videoLatest.Transcriptions.Any())
{
_transcriptionTask.Publish(videoLatest.Id);
_localTranscriptionTask.Publish(videoLatest.Id);
}
}
}
Expand Down
187 changes: 187 additions & 0 deletions TaskEngine/Tasks/LocalTranscriptionTask.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading.Tasks;
using Grpc.Core;
using Newtonsoft.Json.Linq;


using ClassTranscribeDatabase;
using ClassTranscribeDatabase.Models;
using ClassTranscribeDatabase.Services;

using static ClassTranscribeDatabase.CommonUtils;

#pragma warning disable CA2007
// https://learn.microsoft.com/en-us/dotnet/fundamentals/code-analysis/quality-rules/ca2007
// We are okay awaiting on a task in the same thread

namespace TaskEngine.Tasks
{
/// <summary>
/// This task produces the transcriptions for a Video item by sending the video's file path
/// to the Python RPC server (<c>TranscribeAudioRPCAsync</c>) and storing the returned cues
/// as <see cref="Caption"/> rows attached to a <see cref="Transcription"/>.
/// </summary>
[SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated
class LocalTranscriptionTask : RabbitMQTask<string>
{

    private readonly CaptionQueries _captionQueries;
    private readonly RpcClient _rpcClient;


    /// <summary>
    /// Creates the task bound to the <c>TaskType.TranscribeVideo</c> queue.
    /// </summary>
    /// <param name="rabbitMQ">Connection handed to the base task.</param>
    /// <param name="rpcClient">Client used to reach the Python RPC server.</param>
    /// <param name="logger">Logger handed to the base task.</param>
    /// <param name="captionQueries">Stored for later use; not read by the code in this class.</param>
    public LocalTranscriptionTask(RabbitMQConnection rabbitMQ,
        RpcClient rpcClient,
        // GenerateVTTFileTask generateVTTFileTask,
        ILogger<LocalTranscriptionTask> logger, CaptionQueries captionQueries)
        : base(rabbitMQ, TaskType.TranscribeVideo, logger)
    {
        _rpcClient = rpcClient;
        _captionQueries = captionQueries;
    }

    /// <summary>
    /// Transcribes one video: skips it when it is already transcribed or no media/playlist
    /// references it, otherwise requests a transcription over RPC, converts the returned
    /// JSON cues into captions and saves them.
    /// </summary>
    /// <param name="videoId">Id of the video to transcribe.</param>
    /// <param name="taskParameters">Task options (Force is not yet honoured, see TODO below).</param>
    /// <param name="cleanup">Active-task registry passed to <c>RegisterTask</c>.</param>
    /// <remarks>
    /// An <see cref="RpcException"/> from the RPC call is logged and ends the task without rethrowing.
    /// Any other exception after the attempt counter was bumped adds 1000 to
    /// <c>TranscribingAttempts</c>, is logged and is rethrown.
    /// </remarks>
    protected async override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup)
    {
        RegisterTask(cleanup, videoId); // may throw AlreadyInProgress exception

        const string SOURCEINTERNALREF = "ClassTranscribe/Local"; // Do not change me; this is a key inside the database
                                                                  // to indicate the source of the captions was this code

        using (var _context = CTDbContext.CreateDbContext())
        {
            // TODO: taskParameters.Force should wipe all captions and reset the Transcription Status

            Video video = await _context.Videos.Include(v => v.Video1).Where(v => v.Id == videoId).FirstAsync();
            // ! Note the 'Include' ; we don't build the whole tree of related Entities

            if (video.TranscriptionStatus == Video.TranscriptionStatusMessages.NOERROR)
            {
                GetLogger().LogInformation($"{videoId}:Skipping Transcribing of- already complete");
                return;
            }

            var medias = await _context.Medias.Include(m => m.Playlist).Where(m => m.VideoId == videoId && m.Playlist != null).ToListAsync();
            if (medias.Count == 0)
            {
                GetLogger().LogInformation($"{videoId}:Skipping Transcribing - no media / playlist cares about this video");
                return;
            }

            GetLogger().LogInformation($"{videoId}: Has new Phrase Hints: {video.HasPhraseHints()}");

            // NOTE(review): the hints are loaded but not sent with the request below (the
            // PhraseHints field is commented out); they only influence SourceLabel.
            string phraseHints = "";
            if (video.HasPhraseHints())
            {
                var data = await _context.TextData.FindAsync(video.PhraseHintsDataId);
                phraseHints = data.Text;
            }
            else
            { // deprecated
                phraseHints = video.PhraseHints ?? "";
            }

            GetLogger().LogInformation($"{videoId}:Using Phrase Hints length = {phraseHints.Length}");
            // GetKey can throw if the video.Id is currently being transcribed
            // However registerTask should have already detected that
            var key = TaskEngineGlobals.KeyProvider.GetKey(video.Id);
            var keyHeld = true; // guarantees the key is released exactly once on every path
            try
            {
                video.TranscribingAttempts += 10;
                await _context.SaveChangesAsync();
                GetLogger().LogInformation($"{videoId}: Updated TranscribingAttempts = {video.TranscribingAttempts}");
                try
                {
                    GetLogger().LogInformation($"{videoId}: Calling TranscribeAudioRPCAsync");

                    string jsonString;
                    try
                    {
                        // Built inside the try so that a failure here (e.g. Video1 missing)
                        // still releases the key in the finally below.
                        var request = new CTGrpc.TranscriptionRequest
                        {
                            LogId = videoId,
                            FilePath = video.Video1.VMPath,
                            Model = "en",
                            Language = "en"
                            // PhraseHints = phraseHints,
                            // CourseHints = "",
                            // OutputLanguages = "en"
                        };
                        jsonString = (await _rpcClient.PythonServerClient.TranscribeAudioRPCAsync(request)).Json;
                    }
                    catch (RpcException e)
                    {
                        // Every RPC failure is logged (previously only InvalidArgument was);
                        // the task still ends quietly without marking a hard failure.
                        GetLogger().LogError(e, $"TranscribeAudioRPCAsync=({videoId}):{e.Status.StatusCode}:{e.Message}");
                        return;
                    }
                    finally
                    {
                        GetLogger().LogInformation($"{videoId} Transcribe - rpc complete");
                        TaskEngineGlobals.KeyProvider.ReleaseKey(key, video.Id);
                        keyHeld = false;
                    }

                    JObject jObject = JObject.Parse(jsonString);
                    // The explicit string conversion yields the raw value; ToString(Formatting.None)
                    // would return the JSON encoding, i.e. the value wrapped in double quotes.
                    var theLanguage = (string)jObject["result"]["language"];
                    var theCaptions = ParseCaptions(jObject["transcription"]);

                    if (theCaptions.Count > 0)
                    {
                        GetLogger().LogInformation($"{videoId}: Created {theCaptions.Count} captions objects");

                        var t = await _context.Transcriptions.SingleOrDefaultAsync(tr => tr.VideoId == video.Id && tr.SourceInternalRef == SOURCEINTERNALREF && tr.Language == theLanguage && tr.TranscriptionType == TranscriptionType.Caption);
                        GetLogger().LogInformation($"Find Existing Transcriptions null={t == null}");
                        // Did we get the default or an existing Transcription entity?
                        if (t == null)
                        {
                            t = new Transcription()
                            {
                                TranscriptionType = TranscriptionType.Caption,
                                Captions = theCaptions,
                                Language = theLanguage,
                                VideoId = video.Id,
                                Label = $"{theLanguage} (ClassTranscribe)",
                                SourceInternalRef = SOURCEINTERNALREF, //
                                SourceLabel = "ClassTranscribe (Local" + (phraseHints.Length > 0 ? " with phrase hints)" : ")")
                                // Todo store the entire Whisper result here
                            };
                            _context.Add(t);
                        }
                        else
                        {
                            // NOTE(review): the query above does not Include Captions; confirm the
                            // collection is initialised/loaded before appending to it.
                            t.Captions.AddRange(theCaptions);
                        }
                    }

                    // NOTE(review): presumably equal to Video.TranscriptionStatusMessages.NOERROR,
                    // which the skip check above compares against - confirm and use the constant.
                    video.TranscriptionStatus = "NoError";
                    // video.JsonMetadata["LastSuccessfulTime"] = result.LastSuccessTime.ToString();

                    GetLogger().LogInformation($"{videoId}: Saving captions");
                    await _context.SaveChangesAsync();
                }
                catch (Exception ex)
                {
                    GetLogger().LogError(ex, $"{videoId}: Transcription Exception:{ex.StackTrace}");
                    video.TranscribingAttempts += 1000;
                    await _context.SaveChangesAsync();
                    throw;
                }
            }
            finally
            {
                // Only reached with the key still held when saving the attempt counter failed
                // before the RPC section ran.
                if (keyHeld)
                {
                    TaskEngineGlobals.KeyProvider.ReleaseKey(key, video.Id);
                }
            }
        }
    }

    /// <summary>
    /// Converts the "transcription" array of the RPC result into caption entities,
    /// numbering them from zero in the order received.
    /// </summary>
    /// <param name="cues">The JSON array of cues; each cue is read via timestamps.from, timestamps.to and text.</param>
    /// <returns>The captions, possibly empty.</returns>
    /// <exception cref="FormatException">The result has no "transcription" element.</exception>
    private static List<Caption> ParseCaptions(JToken cues)
    {
        if (cues == null)
        {
            throw new FormatException("Transcription result has no 'transcription' element");
        }

        var captions = new List<Caption>();
        int cueCount = 0;
        foreach (var jsonCue in cues)
        {
            captions.Add(new Caption()
            {
                Index = cueCount++,
                Begin = ParseTimestamp(jsonCue["timestamps"]["from"]),
                End = ParseTimestamp(jsonCue["timestamps"]["to"]),
                Text = (string)jsonCue["text"]
            });
        }
        return captions;
    }

    /// <summary>
    /// Parses one cue timestamp into a <see cref="TimeSpan"/>.
    /// </summary>
    /// <param name="token">JSON string value holding the timestamp.</param>
    private static TimeSpan ParseTimestamp(JToken token)
    {
        var raw = (string)token;
        // NOTE(review): assumes hh:mm:ss with a '.' or ',' millisecond separator - confirm against
        // the RPC server's output. Normalising to '.' and parsing with the invariant culture keeps
        // the result independent of the host's regional settings.
        return TimeSpan.Parse(raw.Replace(',', '.'), System.Globalization.CultureInfo.InvariantCulture);
    }

}
}
4 changes: 2 additions & 2 deletions TaskEngine/Tasks/QueueAwakerTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class QueueAwakerTask : RabbitMQTask<JObject>
private readonly DownloadPlaylistInfoTask _downloadPlaylistInfoTask;
private readonly DownloadMediaTask _downloadMediaTask;
// private readonly ConvertVideoToWavTask _convertVideoToWavTask;
private readonly TranscriptionTask _transcriptionTask;
private readonly LocalTranscriptionTask _transcriptionTask;
// nope private readonly GenerateVTTFileTask _generateVTTFileTask;
private readonly ProcessVideoTask _processVideoTask;
private readonly SceneDetectionTask _sceneDetectionTask;
Expand All @@ -39,7 +39,7 @@ public QueueAwakerTask() { }

public QueueAwakerTask(RabbitMQConnection rabbitMQ, DownloadPlaylistInfoTask downloadPlaylistInfoTask,
DownloadMediaTask downloadMediaTask,
TranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask,
LocalTranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask,
// GenerateVTTFileTask generateVTTFileTask,
SceneDetectionTask sceneDetectionTask,
CreateBoxTokenTask createBoxTokenTask,// UpdateBoxTokenTask updateBoxTokenTask,
Expand Down
Loading
Loading