Merge pull request #489 from classtranscribe/UpdateForWhisper

Update for whisper
classtranscribe · Oct 23, 2024 · 042cc8d · 042cc8d
2 parents 6a78953 + 905ab4c
commit 042cc8d
Show file tree

Hide file tree

Showing 25 changed files with 1,573 additions and 64 deletions.
diff --git a/ClassTranscribeDatabase/CommonUtils.cs b/ClassTranscribeDatabase/CommonUtils.cs
@@ -23,7 +23,7 @@ public enum TaskType
             DownloadPlaylistInfo = 3,
             DownloadMedia = 4,
             ConvertMedia = 5,
-            TranscribeVideo = 6,
+            // TranscribeVideo = 6,
             ProcessVideo = 7,
             Aggregator = 8,
             GenerateVTTFile = 9,
@@ -39,7 +39,9 @@ public enum TaskType
             PythonCrawler = 19,
 
             DescribeVideo = 20,
-            DescribeImage = 21
+            DescribeImage = 21,
+            AzureTranscribeVideo = 22,
+            LocalTranscribeVideo = 23
 
         }
 

diff --git a/ClassTranscribeDatabase/global.json b/ClassTranscribeDatabase/global.json
@@ -1,5 +1,5 @@
 {
   "sdk": {
-    "version": "8.0.201"
+    "version": "8.0"
   }
 }
diff --git a/ClassTranscribeServer/Controllers/PlaylistsController.cs b/ClassTranscribeServer/Controllers/PlaylistsController.cs
@@ -170,7 +170,7 @@ public async Task<ActionResult<IEnumerable<PlaylistDTO>>> GetPlaylists2(string o
                     JsonMetadata = m.JsonMetadata,
                     CreatedAt = m.CreatedAt,
                     SceneDetectReady = m.Video.HasSceneObjectData(),
-                    Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
+                    Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
                     SourceType = m.SourceType,
                     Duration = m.Video?.Duration,
                     PublishStatus = m.PublishStatus,
@@ -265,7 +265,7 @@ public async Task<ActionResult<PlaylistDTO>> GetPlaylist(string id)
                     PublishStatus = m.PublishStatus,
                     Options = m.GetOptionsAsJson(),
                     SceneDetectReady = m.Video != null && m.Video.HasSceneObjectData(),
-                    Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
+                    Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
                     Video = m.Video == null ? null : new VideoDTO
                    {
                         Id = m.Video.Id,

diff --git a/ClassTranscribeServer/Utils/WakeDownloader.cs b/ClassTranscribeServer/Utils/WakeDownloader.cs
@@ -104,7 +104,7 @@ public void TranscribeVideo(string videoOrMediaId, bool deleteExisting)
         {
             JObject msg = new JObject
             {
-                { "Type", TaskType.TranscribeVideo.ToString() },
+                { "Type", TaskType.LocalTranscribeVideo.ToString() },
                 { "videoOrMediaId", videoOrMediaId },
                 { "DeleteExisting", deleteExisting }
             };

diff --git a/ClassTranscribeServer/global.json b/ClassTranscribeServer/global.json
@@ -1,5 +1,5 @@
 {
   "sdk": {
-    "version": "8.0.401"
+    "version": "8.0"
   }
 }
diff --git a/PythonRpcServer/.gitignore b/PythonRpcServer/.gitignore
@@ -0,0 +1 @@
+venv/
diff --git a/PythonRpcServer/requirements.txt b/PythonRpcServer/requirements.txt
@@ -32,8 +32,9 @@ wcwidth==0.2.13
 
 # Not versioned
 numpy
-pytube     # if not available, use the tar.gz package (see Dockerfile)
-
+# pytube is no longer maintained (old note: if not available, use the tar.gz package; see Dockerfile)
+yt-dlp
+# Always get the latest version
 
 # protobuf version 3.18.3 causes  NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
 # Likely need to coordinate updating the C# version too

diff --git a/PythonRpcServer/server.py b/PythonRpcServer/server.py
@@ -12,6 +12,9 @@
 from echo import EchoProvider
 from kaltura import KalturaProvider
 from mediaprovider import InvalidPlaylistInfoException
+from transcribe import transcribe_audio
+
+import json
 import hasher 
 import ffmpeg
 # import phrasehinter
@@ -41,6 +44,18 @@ def LogWorker(logId, worker):
 
 
 class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
+    # Transcribe into a JSON string from the transcribed text
+    # Make it return a JSON string
+    # Change the name to TranscribeRPC
+    # def CaptionRPC(self, request, context):
+    #     #See CaptionRequest
+    #     print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
+    #     kalturaprovider = KalturaProvider()
+    #     result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId))
+    #     return  ct_pb2.JsonString(json = result)
+
+
+
     def GetScenesRPC(self, request, context):
         raise NotImplementedError('Implementation now in pyapi')
 #        res = scenedetector.find_scenes(request.filePath)
@@ -113,14 +128,31 @@ def ComputeFileHash(self, request, context):
     def GetMediaInfoRPC(self, request, context):
         result = LogWorker(f"GetMediaInfo({request.filePath})", lambda: ffmpeg.getMediaInfo(request.filePath))
         return  ct_pb2.JsonString(json = result)
+
+
+    def TranscribeAudioRPC(self, request, context):
+        """Transcribe the media file named in the request and return the result as JSON.
+
+        Reads ``request.logId``, ``request.filePath`` and ``request.testing``.
+        The work is delegated to ``transcribe_audio`` (run through ``LogWorker``)
+        and its return value is serialized with ``json.dumps``.
+
+        Returns:
+            ct_pb2.JsonString: the serialized transcription on success. On any
+            exception the gRPC status is set to INTERNAL and the payload is
+            ``{"error": "<message>"}`` instead.
+        """
+        print(f"TranscribeAudioRPC({request.logId};{request.filePath})")
+        try:
+            logging.info(f"Starting transcription for file: {request.filePath}")
+            transcription_result = LogWorker(
+                f"TranscribeAudioRPC({request.filePath})",
+                lambda: transcribe_audio(request.filePath, request.testing)
+            )
+            logging.info(f"Transcription completed successfully for: {request.filePath}")
+            return ct_pb2.JsonString(json=json.dumps(transcription_result))
+
+        except Exception as e:
+            # Record the failure (with traceback) on the server side as well;
+            # otherwise the only trace of the error is the reply sent to the caller.
+            logging.exception(f"Transcription failed for: {request.filePath}")
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"Transcription failed: {str(e)}")
+            return ct_pb2.JsonString(json=json.dumps({"error": str(e)}))
 
 def serve():
     print("Python RPC Server Starting")
 
     # Until we can ensure no timeouts on remote services, the default here is set to a conservative low number
     # This is to ensure we can still make progress even if every python tasks tries to use all cpu cores.
     max_workers=int(os.getenv('NUM_PYTHON_WORKERS', 3))
-    print(f"max_workers={max_workers}")
+    print(f"max_workers={max_workers}. Starting up grpc server...")
 
     server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
 

diff --git a/PythonRpcServer/transcribe.py b/PythonRpcServer/transcribe.py
@@ -0,0 +1,126 @@
+import os
+import subprocess
+import json
+from time import perf_counter 
+from ffmpy import FFmpeg
+import utils
+
+# Whisper command-line executable to run; override with the WHISPER_EXE environment variable.
+WHISPER_EXECUTABLE = os.environ.get('WHISPER_EXE','whisper')  # The default 'whisper' is a bare command name (no directory), so it has to be findable on PATH
+# Model file handed to the executable with '-m'; override with the WHISPER_MODEL environment variable.
+MODEL = os.environ.get('WHISPER_MODEL','models/ggml-base.en.bin')
+
+def convert_video_to_wav(input_filepath, offset=None):
+    """
+    Converts a media file to WAV audio (16 kHz, mono, 16-bit PCM) using ffmpy.
+
+    Args:
+        input_filepath: Path of the media file handed to ffmpeg as input.
+        offset: Start position passed to ffmpeg's '-ss' input option.
+            None means 0.0 (start of the file).
+
+    Returns:
+        A tuple (output_filepath, ext). output_filepath is the path obtained
+        from utils.getTmpFile() that the audio was written to; ext is always
+        '.wav' and names the format written (the '-f wav' option forces it),
+        not necessarily the suffix of output_filepath.
+
+    Raises:
+        Re-raises whatever exception the conversion produced, after printing
+        it and removing any partially written output file.
+    """
+    if offset is None:
+        offset = 0.0
+
+    # Remember the output path so a failed run can clean up after itself.
+    output_filepath = None
+    try:
+        start_time = perf_counter()
+        nthreads = utils.getMaxThreads()
+
+        print(f"Converting video '{input_filepath}' to WAV with offset {offset} using {nthreads} thread(s).")
+        output_filepath = utils.getTmpFile()
+        ext = '.wav'
+
+        ff = FFmpeg(
+            global_options=f"-hide_banner -loglevel error -nostats -threads {nthreads}",
+            inputs={input_filepath: f'-ss {offset}'},
+            outputs={output_filepath: '-c:a pcm_s16le -ac 1 -y -ar 16000 -f wav'}
+        )
+        print(f"Starting conversion. Audio output will be saved in {output_filepath}")
+        ff.run()
+        end_time = perf_counter()
+        print(f"Conversion complete. Duration: {int(end_time - start_time)} seconds")
+        return output_filepath, ext
+    except Exception as e:
+        print("Exception during conversion:" + str(e))
+        # The caller never learns the temp path when we fail, so a partial
+        # output file would otherwise be left behind for good.
+        if output_filepath is not None and os.path.exists(output_filepath):
+            try:
+                os.remove(output_filepath)
+            except OSError as cleanup_error:
+                print(f"Error deleting partial WAV file: {str(cleanup_error)}")
+        # Bare raise keeps the original traceback intact.
+        raise
+
+def _load_json_file(path):
+    """Parse the JSON document stored at *path* and return the resulting object."""
+    with open(path, 'r') as json_file:
+        return json.load(json_file)
+
+
+def _remove_quietly(path, description):
+    """Best-effort removal of *path*: report the outcome, never raise."""
+    try:
+        os.remove(path)
+        print(f"Deleted the {description} file: {path}")
+    except FileNotFoundError:
+        # Nothing to clean up (e.g. Whisper failed before writing its output).
+        pass
+    except OSError as e:
+        print(f"Error deleting {description} file: {str(e)}")
+
+
+def transcribe_audio(media_filepath, testing=False):
+    """
+    Transcribes a media file by running the Whisper executable and returns
+    the parsed JSON result.
+
+    Args:
+        media_filepath: Path of the media file. Paths not ending in '.wav'
+            are first converted with convert_video_to_wav(). The special
+            value 'TEST-transcribe_example_result' returns the content of
+            'transcribe_exampleffmp_result.json' without running Whisper.
+        testing: When true, returns the content of
+            '/PythonRpcServer/transcribe_hellohellohello.wav.json' without
+            running Whisper.
+
+    Returns:
+        The object parsed from the JSON file that Whisper is expected to
+        write next to its input ('<input>.json').
+
+    Raises:
+        FileNotFoundError: The media file, or the expected JSON output, does
+            not exist.
+        RuntimeError: The Whisper process exited with a non-zero status.
+    """
+    if testing:
+        transcription_result = _load_json_file("/PythonRpcServer/transcribe_hellohellohello.wav.json")
+
+        # Print the transcription result (testing purpose)
+        print("Transcription result:")
+        print(json.dumps(transcription_result, indent=4))
+
+        return transcription_result
+
+    if media_filepath == 'TEST-transcribe_example_result':
+        return _load_json_file('transcribe_exampleffmp_result.json')
+
+    # Ensure the media file exists
+    if not os.path.exists(media_filepath):
+        raise FileNotFoundError(f"Media file not found: {media_filepath}")
+
+    # convert video to wav if needed
+    wav_created = False  # Track if WAV was created
+    if not media_filepath.endswith('.wav'):
+        media_filepath, _ = convert_video_to_wav(media_filepath)
+        wav_created = True  # WAV file was created
+
+    # Path to the output JSON file that Whisper will generate
+    json_output_path = f"{media_filepath}.json"
+
+    try:
+        # Drop any stale result so a leftover file cannot be mistaken for new output
+        if os.path.exists(json_output_path):
+            os.remove(json_output_path)
+
+        # Command to run Whisper.cpp inside the container using the main executable
+        whisper_command = [
+            WHISPER_EXECUTABLE,                  # Path to Whisper executable
+            '-ojf',                              # Output as JSON file
+            '-f', media_filepath,                # Media file path
+            '-m', MODEL
+        ]
+
+        print("Running Whisper transcription inside the container...")
+
+        # Execute the Whisper command
+        result = subprocess.run(whisper_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        # Handle command failure
+        if result.returncode != 0:
+            # errors='replace' so undecodable bytes in stderr cannot mask the real failure
+            stderr_text = result.stderr.decode('utf-8', errors='replace')
+            raise RuntimeError(f"Whisper failed with error:\n{stderr_text}")
+
+        # Check if the output JSON file was generated
+        print(f"Checking for JSON output at: {json_output_path}")
+        if not os.path.exists(json_output_path):
+            raise FileNotFoundError(f"Expected JSON output file not found: {json_output_path}")
+
+        # Load the JSON transcription result
+        transcription_result = _load_json_file(json_output_path)
+
+        # Print the transcription result (testing purpose)
+        print("Transcription result:")
+        print(json.dumps(transcription_result, indent=4))
+
+        return transcription_result
+    finally:
+        # Runs on success and on every failure path, so neither the JSON output
+        # nor the temporary WAV is left behind when Whisper fails.
+        _remove_quietly(json_output_path, "JSON")
+        if wav_created:
+            _remove_quietly(media_filepath, "WAV")
+
+# Example usage
+if __name__ == '__main__':
+    # Loads a saved JSON transcription result from a fixed path inside the
+    # container and pretty-prints it; no transcription is run here.
+    sample_json_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
+    with open(sample_json_path, 'r') as sample_file:
+        sample_result = json.load(sample_file)
+
+    pretty_result = json.dumps(sample_result, indent=4)
+    print("Transcription Result:", pretty_result)
+