classtranscribe · tyler232 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/ClassTranscribeDatabase/global.json b/ClassTranscribeDatabase/global.json
@@ -1,5 +1,5 @@
 {
   "sdk": {
-    "version": "8.0.201"
+    "version": "8.0"
   }
 }
diff --git a/ClassTranscribeServer/global.json b/ClassTranscribeServer/global.json
@@ -1,5 +1,5 @@
 {
   "sdk": {
-    "version": "8.0.401"
+    "version": "8.0"
   }
 }
diff --git a/PythonRpcServer/randomvoice_16kHz.json b/PythonRpcServer/randomvoice_16kHz.json
@@ -0,0 +1 @@
+{"text": " Hello? Hello? Hello?", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 3.0, "text": " Hello? Hello? Hello?", "tokens": [50363, 18435, 30, 18435, 30, 18435, 30, 50513], "temperature": 0.0, "avg_logprob": -0.636968559688992, "compression_ratio": 1.1764705882352942, "no_speech_prob": 0.22877301275730133}], "language": "en"}
diff --git a/PythonRpcServer/randomvoice_16kHz.wav b/PythonRpcServer/randomvoice_16kHz.wav
diff --git a/PythonRpcServer/server.py b/PythonRpcServer/server.py
@@ -41,6 +41,18 @@ def LogWorker(logId, worker):
 
 
 class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
+    # TODO: transcribe the audio and build a JSON string from the transcribed text.
+    # TODO: make this RPC return that JSON string.
+    # TODO: rename this RPC to TranscribeRPC.
+    def CaptionRPC(self, request, context):
+        #See CaptionRequest
+        print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
+        kalturaprovider = KalturaProvider()
+        result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId))
+        return  ct_pb2.JsonString(json = result)
+
+
+
     def GetScenesRPC(self, request, context):
         raise NotImplementedError('Implementation now in pyapi')
 #        res = scenedetector.find_scenes(request.filePath)

diff --git a/PythonRpcServer/transcribe.py b/PythonRpcServer/transcribe.py
@@ -0,0 +1,57 @@
+import subprocess
+import os
+import json
+import re
+
+def transcribe_audio_with_whisper(audio_file_path):
+    if not os.path.exists(audio_file_path):
+        raise FileNotFoundError(f"Audio file {audio_file_path} does not exist.")
+
+    command = [
+        "whisper",
+        audio_file_path,
+        "--model", "base.en",
+        "--output_format", "json"
+    ]
+
+    try:
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+
+        print("Whisper Output:")
+        print(result.stdout)
+
+        formatted_data = {"en": []}
+
+        segments = result.stdout.strip().split('\n\n')
+        for segment in segments:
+            match = re.search(r'\[(\d+:\d+\.\d+)\s+-->\s+(\d+:\d+\.\d+)\]\s+(.*)', segment)
+            if match:
+                start_time = match.group(1)
+                end_time = match.group(2)
+                text = match.group(3).strip()
+
+                formatted_data["en"].append({
+                    "starttime": start_time,
+                    "endtime": end_time,
+                    "caption": text
+                })
+
+        return formatted_data
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error during transcription: {e.stderr}")
+        return None
+
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return None
+
+if __name__ == "__main__":
+    audio_file = "randomvoice_16kHz.wav"
+
+    transcription = transcribe_audio_with_whisper(audio_file)
+
+    if transcription:
+        print(json.dumps(transcription, indent=4))
+    else:
+        print("Transcription failed.")
diff --git a/TaskEngine.Dockerfile b/TaskEngine.Dockerfile
@@ -1,13 +1,17 @@
 FROM mcr.microsoft.com/dotnet/sdk:8.0-bookworm-slim as build
 # See https://mcr.microsoft.com/en-us/product/dotnet/sdk/tags
 #See more comments in API.Dockerfile
+# RUN ls
+RUN dotnet --list-sdks
 
 WORKDIR /
 RUN git clone https://github.com/eficode/wait-for.git
 
 WORKDIR /src
 COPY ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj
 # --verbosity normal|diagnostic
+
+
 RUN dotnet restore --verbosity diagnostic ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj
 
 COPY ./TaskEngine/TaskEngine.csproj ./TaskEngine/TaskEngine.csproj

diff --git a/TaskEngine/Program.cs b/TaskEngine/Program.cs
@@ -81,7 +81,8 @@ public static void SetupServices()
                 .AddSingleton<DownloadPlaylistInfoTask>()
                 .AddSingleton<DownloadMediaTask>()
                 .AddSingleton<ConvertVideoToWavTask>()
-                .AddSingleton<TranscriptionTask>()
+                .AddSingleton<LocalTranscriptionTask>()
+                .AddSingleton<AzureTranscriptionTask>()
                 .AddSingleton<QueueAwakerTask>()
                 // .AddSingleton<GenerateVTTFileTask>()
                 .AddSingleton<RpcClient>()
@@ -175,7 +176,7 @@ static void createTaskQueues() {
             // Transcription Related
             _logger.LogInformation($"Creating TranscriptionTask consumers. Concurrency={concurrent_transcriptions} ");
 
-            _serviceProvider.GetService<TranscriptionTask>().Consume(concurrent_transcriptions);
+            _serviceProvider.GetService<LocalTranscriptionTask>().Consume(concurrent_transcriptions);
 
             // no more! - _serviceProvider.GetService<GenerateVTTFileTask>().Consume(concurrent_transcriptions);
 

diff --git a/TaskEngine/Tasks/TranscriptionTask.cs → TaskEngine/Tasks/AzureTranscriptionTask.cs b/TaskEngine/Tasks/TranscriptionTask.cs → TaskEngine/Tasks/AzureTranscriptionTask.cs
@@ -21,17 +21,17 @@ namespace TaskEngine.Tasks
     /// This task produces the transcriptions for a Video item.
     /// </summary>
     [SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated
-    class TranscriptionTask : RabbitMQTask<string>
+    class AzureTranscriptionTask : RabbitMQTask<string>
     {
 
         private readonly MSTranscriptionService _msTranscriptionService;
         // nope private readonly GenerateVTTFileTask _generateVTTFileTask;
         private readonly CaptionQueries _captionQueries;
 
 
-        public TranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService,
+        public AzureTranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService,
             // GenerateVTTFileTask generateVTTFileTask, 
-            ILogger<TranscriptionTask> logger, CaptionQueries captionQueries)
+            ILogger<AzureTranscriptionTask> logger, CaptionQueries captionQueries)
             : base(rabbitMQ, TaskType.TranscribeVideo, logger)
         {
             _msTranscriptionService = msTranscriptionService;

diff --git a/TaskEngine/Tasks/ConvertVideoToWavTask.cs b/TaskEngine/Tasks/ConvertVideoToWavTask.cs
@@ -21,13 +21,13 @@ namespace TaskEngine.Tasks
     class ConvertVideoToWavTask : RabbitMQTask<string>
     {
         private readonly RpcClient _rpcClient;
-        private readonly TranscriptionTask _transcriptionTask;
+        private readonly LocalTranscriptionTask _localTranscriptionTask;
 
-        public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, TranscriptionTask transcriptionTask, ILogger<ConvertVideoToWavTask> logger)
+        public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, LocalTranscriptionTask localTranscriptionTask, ILogger<ConvertVideoToWavTask> logger)
             : base(rabbitMQ, TaskType.ConvertMedia, logger)
         {
             _rpcClient = rpcClient;
-            _transcriptionTask = transcriptionTask;
+            _localTranscriptionTask = localTranscriptionTask;
         }
 
         protected override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup)
@@ -72,11 +72,10 @@ private async Task OldOnConsumeNotUsed(string videoId)
                         videoLatest.Audio = fileRecord;
                         await _context.SaveChangesAsync();
 
-
                         // If no transcriptions present, produce transcriptions.
                         if (!videoLatest.Transcriptions.Any())
                         {
-                            _transcriptionTask.Publish(videoLatest.Id);
+                            _localTranscriptionTask.Publish(videoLatest.Id);
                         }
                     }
                 }

diff --git a/TaskEngine/Tasks/LocalTranscriptionTask.cs b/TaskEngine/Tasks/LocalTranscriptionTask.cs
@@ -0,0 +1,187 @@
+using Microsoft.EntityFrameworkCore;
+using Microsoft.Extensions.Logging;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Threading.Tasks;
+using Grpc.Core;
+using Newtonsoft.Json.Linq;
+
+
+using ClassTranscribeDatabase;
+using ClassTranscribeDatabase.Models;
+using ClassTranscribeDatabase.Services;
+
+using static ClassTranscribeDatabase.CommonUtils;
+
+#pragma warning disable CA2007
+// https://learn.microsoft.com/en-us/dotnet/fundamentals/code-analysis/quality-rules/ca2007
+// We are okay awaiting on a task in the same thread
+
+namespace TaskEngine.Tasks
+{
+    /// <summary>
+    /// This task produces the transcriptions for a Video item by sending the video's
+    /// file path to the Python RPC server (TranscribeAudioRPC) and storing the cues
+    /// of the returned JSON as Caption entities.
+    /// </summary>
+    [SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated
+    class LocalTranscriptionTask : RabbitMQTask<string>
+    {
+
+        private readonly CaptionQueries _captionQueries;
+        private readonly RpcClient _rpcClient;
+
+
+        /// <summary>
+        /// Creates the task, bound to the <c>TaskType.TranscribeVideo</c> queue.
+        /// </summary>
+        /// <param name="rabbitMQ">Connection handed to the base task.</param>
+        /// <param name="rpcClient">Client used to reach the Python RPC server.</param>
+        /// <param name="logger">Logger handed to the base task.</param>
+        /// <param name="captionQueries">Stored for later use; not read by this class yet.</param>
+        public LocalTranscriptionTask(RabbitMQConnection rabbitMQ, 
+            RpcClient rpcClient,
+            // GenerateVTTFileTask generateVTTFileTask, 
+            ILogger<LocalTranscriptionTask> logger, CaptionQueries captionQueries)
+            : base(rabbitMQ, TaskType.TranscribeVideo, logger)
+        {
+            _rpcClient = rpcClient;
+            _captionQueries = captionQueries;
+        }
+
+        /// <summary>
+        /// Transcribes one video: skips it when it is already complete or no playlist
+        /// media refers to it, otherwise calls the RPC server, converts the returned
+        /// cues into captions and saves them with the video's new transcription status.
+        /// </summary>
+        /// <param name="videoId">Id of the Video to transcribe.</param>
+        /// <param name="taskParameters">Task options; not read yet (see TODO below).</param>
+        /// <param name="cleanup">Active-task registry passed to RegisterTask.</param>
+        protected async override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup)
+        {
+            RegisterTask(cleanup, videoId); // may throw AlreadyInProgress exception
+
+            const string SOURCEINTERNALREF= "ClassTranscribe/Local"; // Do not change me; this is a key inside the database
+            // to indicate the source of the captions was this code
+
+
+            using (var _context = CTDbContext.CreateDbContext())
+            {
+
+                // TODO: taskParameters.Force should wipe all captions and reset the Transcription Status
+
+                Video video = await _context.Videos.Include(v => v.Video1).Where(v => v.Id == videoId).FirstAsync();
+                // ! Note the 'Include' ; we don't build the whole tree of related Entities
+
+                if (video.TranscriptionStatus == Video.TranscriptionStatusMessages.NOERROR)
+                {
+                    GetLogger().LogInformation($"{videoId}:Skipping Transcribing of- already complete");
+                    return;
+                }
+                var medias = await  _context.Medias.Include(m=>m.Playlist).Where(m=>m.VideoId == videoId && m.Playlist != null).ToListAsync();
+                if(medias.Count == 0) {
+                    GetLogger().LogInformation($"{videoId}:Skipping Transcribing - no media / playlist cares about this video");
+                    return;
+                }
+
+                GetLogger().LogInformation($"{videoId}: Has new Phrase Hints: {video.HasPhraseHints()}");
+
+                string phraseHints;
+                if (video.HasPhraseHints()) {
+                    var data = await _context.TextData.FindAsync(video.PhraseHintsDataId);
+                    // FindAsync yields null when the row is gone; treat that as "no hints"
+                    phraseHints = data?.Text ?? "";
+                } else
+                { // deprecated
+                    phraseHints = video.PhraseHints ?? "";
+                }
+
+                GetLogger().LogInformation($"{videoId}:Using Phrase Hints length = {phraseHints.Length}");
+                // GetKey can throw if the video.Id is currently being transcribed
+                // However registerTask should have already detected that
+                var key = TaskEngineGlobals.KeyProvider.GetKey(video.Id);
+                // Tracks whether the key still has to be handed back, so that a failure
+                // before the RPC call (e.g. while saving) cannot leak it.
+                var keyHeld = true;
+
+                try
+                {
+                    video.TranscribingAttempts += 10;
+                    await _context.SaveChangesAsync();
+                    GetLogger().LogInformation($"{videoId}: Updated TranscribingAttempts = {video.TranscribingAttempts}");
+                    try
+                    {
+
+                        GetLogger().LogInformation($"{videoId}: Calling TranscribeAudioRPCAsync");
+
+                        var request = new CTGrpc.TranscriptionRequest
+                        {
+                            LogId = videoId,
+                            FilePath = video.Video1.VMPath,
+                            Model = "en",
+                            Language = "en"
+                            // PhraseHints = phraseHints,
+                            // CourseHints = "",
+                            // OutputLanguages = "en"
+                        };
+                        string jsonString;
+                        try
+                        {
+                            jsonString = (await _rpcClient.PythonServerClient.TranscribeAudioRPCAsync(request)).Json;
+                        }
+                        catch (RpcException e)
+                        {
+                            // Every RPC failure ends this attempt without rethrowing; log all of
+                            // them (not only InvalidArgument) so the failure is visible.
+                            GetLogger().LogError(e, $"TranscribeAudioRPCAsync=({videoId}):{e.Status.StatusCode}:{e.Message}");
+                            return;
+                        }
+                        finally
+                        {
+                            GetLogger().LogInformation($"{videoId} Transcribe - rpc complete");
+                            // The key is only needed for the duration of the RPC call
+                            keyHeld = false;
+                            TaskEngineGlobals.KeyProvider.ReleaseKey(key, video.Id);
+                        }
+
+                        JObject jObject = JObject.Parse(jsonString);
+                        var theLanguage = ReadRequiredString(jObject["result"]?["language"], "result.language");
+                        var theCaptions = ParseCaptions(jObject["transcription"]);
+
+                        if (theCaptions.Count > 0)
+                        {
+                            GetLogger().LogInformation($"{videoId}: Created {theCaptions.Count} captions objects"); 
+
+                            var existing = await _context.Transcriptions.SingleOrDefaultAsync(tr => tr.VideoId == video.Id && tr.SourceInternalRef == SOURCEINTERNALREF && tr.Language == theLanguage && tr.TranscriptionType == TranscriptionType.Caption);
+                            GetLogger().LogInformation($"Find Existing Transcriptions null={existing == null}");
+                            // Did we get the default or an existing Transcription entity?
+                            if (existing == null)
+                            {
+                                var created = new Transcription()
+                                {
+                                    TranscriptionType = TranscriptionType.Caption,
+                                    Captions = theCaptions,
+                                    Language = theLanguage,
+                                    VideoId = video.Id,
+                                    Label = $"{theLanguage} (ClassTranscribe)",
+                                    SourceInternalRef = SOURCEINTERNALREF, // 
+                                    SourceLabel = "ClassTranscribe (Local" + (phraseHints.Length>0 ?" with phrase hints)" : ")")
+                                    // Todo store the entire Whisper result here
+                                };
+                                _context.Add(created);
+                            }
+                            else
+                            {
+                                // NOTE(review): the query above does not Include Captions and the new
+                                // cue indexes restart at 0 - confirm appending is what is wanted here.
+                                existing.Captions.AddRange(theCaptions);
+                            }
+                        }
+
+
+                        // NOTE(review): presumably the same value as
+                        // Video.TranscriptionStatusMessages.NOERROR checked above - confirm.
+                        video.TranscriptionStatus = "NoError";
+                        // video.JsonMetadata["LastSuccessfulTime"] = result.LastSuccessTime.ToString();
+
+                        GetLogger().LogInformation($"{videoId}: Saving captions"); 
+                        await _context.SaveChangesAsync();                     
+                    }
+                    catch (Exception ex)
+                    {
+                        GetLogger().LogError(ex, $"{videoId}: Transcription Exception:{ex.StackTrace}");
+                        video.TranscribingAttempts += 1000;
+                        await _context.SaveChangesAsync();
+                        throw;
+                    }
+                }
+                finally
+                {
+                    if (keyHeld)
+                    {
+                        TaskEngineGlobals.KeyProvider.ReleaseKey(key, video.Id);
+                    }
+                }
+
+            }
+        }
+
+        /// <summary>
+        /// Converts the "transcription" array of the RPC result into Caption objects,
+        /// numbering them from 0 in the order received.
+        /// </summary>
+        /// <param name="cues">The "transcription" token of the parsed result.</param>
+        /// <returns>One Caption per cue; empty when the array is empty.</returns>
+        /// <exception cref="InvalidOperationException">The token or a cue timestamp is missing.</exception>
+        private static List<Caption> ParseCaptions(JToken cues)
+        {
+            if (cues == null)
+            {
+                throw new InvalidOperationException("Transcription result is missing 'transcription'");
+            }
+
+            var captions = new List<Caption>();
+            foreach (var cue in cues)
+            {
+                captions.Add(new Caption()
+                {
+                    Index = captions.Count,
+                    Begin = ParseTimestamp(cue["timestamps"]?["from"], "timestamps.from"),
+                    End = ParseTimestamp(cue["timestamps"]?["to"], "timestamps.to"),
+                    // Read the string value itself; JToken.ToString(Formatting.None) would
+                    // return JSON text, i.e. the value wrapped in quotes and escaped.
+                    Text = (string)cue["text"] ?? ""
+                });
+            }
+            return captions;
+        }
+
+        /// <summary>
+        /// Parses a cue timestamp such as "00:00:03.000" into a TimeSpan, independent
+        /// of the current culture.
+        /// </summary>
+        /// <param name="token">Token holding the timestamp text.</param>
+        /// <param name="path">Name of the field, used in the error message.</param>
+        private static TimeSpan ParseTimestamp(JToken token, string path)
+        {
+            var text = ReadRequiredString(token, path);
+            // Assumes some transcribers write the milliseconds after a comma
+            // ("00:00:03,000") - TODO confirm; normalise so both forms parse.
+            return TimeSpan.Parse(text.Replace(',', '.'), System.Globalization.CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>
+        /// Returns the string value of <paramref name="token"/> without JSON quoting.
+        /// </summary>
+        /// <param name="token">Token to read; may be null when the field is absent.</param>
+        /// <param name="path">Name of the field, used in the error message.</param>
+        /// <exception cref="InvalidOperationException">The value is absent or empty.</exception>
+        private static string ReadRequiredString(JToken token, string path)
+        {
+            var value = (string)token;
+            if (string.IsNullOrEmpty(value))
+            {
+                throw new InvalidOperationException($"Transcription result is missing '{path}'");
+            }
+            return value;
+        }
+
+    }
+}
diff --git a/TaskEngine/Tasks/QueueAwakerTask.cs b/TaskEngine/Tasks/QueueAwakerTask.cs
@@ -22,7 +22,7 @@ class QueueAwakerTask : RabbitMQTask<JObject>
         private readonly DownloadPlaylistInfoTask _downloadPlaylistInfoTask;
         private readonly DownloadMediaTask _downloadMediaTask;
         // private readonly ConvertVideoToWavTask _convertVideoToWavTask;
-        private readonly TranscriptionTask _transcriptionTask;
+        private readonly LocalTranscriptionTask _transcriptionTask;
         // nope private readonly GenerateVTTFileTask _generateVTTFileTask;
         private readonly ProcessVideoTask _processVideoTask;
         private readonly SceneDetectionTask _sceneDetectionTask;
@@ -39,7 +39,7 @@ public QueueAwakerTask() { }
 
         public QueueAwakerTask(RabbitMQConnection rabbitMQ, DownloadPlaylistInfoTask downloadPlaylistInfoTask,
             DownloadMediaTask downloadMediaTask,
-            TranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask,
+            LocalTranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask,
             // GenerateVTTFileTask generateVTTFileTask, 
             SceneDetectionTask sceneDetectionTask,
             CreateBoxTokenTask createBoxTokenTask,// UpdateBoxTokenTask updateBoxTokenTask,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"text": " Hello? Hello? Hello?", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 3.0, "text": " Hello? Hello? Hello?", "tokens": [50363, 18435, 30, 18435, 30, 18435, 30, 50513], "temperature": 0.0, "avg_logprob": -0.636968559688992, "compression_ratio": 1.1764705882352942, "no_speech_prob": 0.22877301275730133}], "language": "en"}