Skip to content

Commit

Permalink
Merge pull request #489 from classtranscribe/UpdateForWhisper
Browse files Browse the repository at this point in the history
Update for whisper
  • Loading branch information
angrave authored Oct 23, 2024
2 parents 6a78953 + 905ab4c commit 042cc8d
Show file tree
Hide file tree
Showing 25 changed files with 1,573 additions and 64 deletions.
6 changes: 4 additions & 2 deletions ClassTranscribeDatabase/CommonUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public enum TaskType
DownloadPlaylistInfo = 3,
DownloadMedia = 4,
ConvertMedia = 5,
TranscribeVideo = 6,
// TranscribeVideo = 6,
ProcessVideo = 7,
Aggregator = 8,
GenerateVTTFile = 9,
Expand All @@ -39,7 +39,9 @@ public enum TaskType
PythonCrawler = 19,

DescribeVideo = 20,
DescribeImage = 21
DescribeImage = 21,
AzureTranscribeVideo = 22,
LocalTranscribeVideo = 23

}

Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeDatabase/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.201"
"version": "8.0"
}
}
4 changes: 2 additions & 2 deletions ClassTranscribeServer/Controllers/PlaylistsController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ public async Task<ActionResult<IEnumerable<PlaylistDTO>>> GetPlaylists2(string o
JsonMetadata = m.JsonMetadata,
CreatedAt = m.CreatedAt,
SceneDetectReady = m.Video.HasSceneObjectData(),
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
SourceType = m.SourceType,
Duration = m.Video?.Duration,
PublishStatus = m.PublishStatus,
Expand Down Expand Up @@ -265,7 +265,7 @@ public async Task<ActionResult<PlaylistDTO>> GetPlaylist(string id)
PublishStatus = m.PublishStatus,
Options = m.GetOptionsAsJson(),
SceneDetectReady = m.Video != null && m.Video.HasSceneObjectData(),
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
Video = m.Video == null ? null : new VideoDTO
{
Id = m.Video.Id,
Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeServer/Utils/WakeDownloader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public void TranscribeVideo(string videoOrMediaId, bool deleteExisting)
{
JObject msg = new JObject
{
{ "Type", TaskType.TranscribeVideo.ToString() },
{ "Type", TaskType.LocalTranscribeVideo.ToString() },
{ "videoOrMediaId", videoOrMediaId },
{ "DeleteExisting", deleteExisting }
};
Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeServer/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.401"
"version": "8.0"
}
}
1 change: 1 addition & 0 deletions PythonRpcServer/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
5 changes: 3 additions & 2 deletions PythonRpcServer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ wcwidth==0.2.13

# Not versioned
numpy
pytube # if not available, use the tar.gz package (see Dockerfile)

# pytube is no longer maintained (previously: if not available, use the tar.gz package; see Dockerfile)
yt-dlp
#Always get latest

# protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
# Likely need to coordinate updating the C# version too
Expand Down
34 changes: 33 additions & 1 deletion PythonRpcServer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from echo import EchoProvider
from kaltura import KalturaProvider
from mediaprovider import InvalidPlaylistInfoException
from transcribe import transcribe_audio

import json
import hasher
import ffmpeg
# import phrasehinter
Expand Down Expand Up @@ -41,6 +44,18 @@ def LogWorker(logId, worker):


class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
# TODO: transcribe the audio and turn the transcribed text into a JSON string.
# Make it return a JSON string.
# Rename it to TranscribeRPC.
# def CaptionRPC(self, request, context):
# #See CaptionRequest
# print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
# kalturaprovider = KalturaProvider()
# result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId))
# return ct_pb2.JsonString(json = result)



def GetScenesRPC(self, request, context):
raise NotImplementedError('Implementation now in pyapi')
# res = scenedetector.find_scenes(request.filePath)
Expand Down Expand Up @@ -113,14 +128,31 @@ def ComputeFileHash(self, request, context):
def GetMediaInfoRPC(self, request, context):
    """Return the result of ``ffmpeg.getMediaInfo`` for ``request.filePath``.

    The lookup runs through ``LogWorker`` and its result is passed as the
    ``json`` field of a ``ct_pb2.JsonString`` reply.
    """
    def probe_media():
        # Evaluated by LogWorker, exactly like the lambda it replaces.
        return ffmpeg.getMediaInfo(request.filePath)

    media_info = LogWorker(f"GetMediaInfo({request.filePath})", probe_media)
    return ct_pb2.JsonString(json=media_info)


def TranscribeAudioRPC(self, request, context):
    """Transcribe the media file at ``request.filePath`` and reply with JSON.

    Reads ``request.logId``, ``request.filePath`` and ``request.testing``.
    The work is delegated to ``transcribe_audio`` through ``LogWorker``.

    Returns:
        ``ct_pb2.JsonString`` whose ``json`` field is the serialized
        transcription result. On failure the gRPC status is set to
        ``INTERNAL`` and the reply carries ``{"error": "<message>"}``.
    """
    print(f"TranscribeAudioRPC({request.logId};{request.filePath})")
    try:
        logging.info("Starting transcription for file: %s", request.filePath)
        transcription_result = LogWorker(
            f"TranscribeAudioRPC({request.filePath})",
            lambda: transcribe_audio(request.filePath, request.testing)
        )
        logging.info("Transcription completed successfully for: %s", request.filePath)
        return ct_pb2.JsonString(json=json.dumps(transcription_result))

    except Exception as e:
        # RPC boundary: keep the traceback in the server log (the client only
        # receives the message), then report the failure through gRPC status.
        logging.exception("Transcription failed for: %s", request.filePath)
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(f"Transcription failed: {str(e)}")
        return ct_pb2.JsonString(json=json.dumps({"error": str(e)}))

def serve():
print("Python RPC Server Starting")

# Until we can ensure no timeouts on remote services, the default here is set to a conservative low number
# This is to ensure we can still make progress even if every python tasks tries to use all cpu cores.
max_workers=int(os.getenv('NUM_PYTHON_WORKERS', 3))
print(f"max_workers={max_workers}")
print(f"max_workers={max_workers}. Starting up grpc server...")

server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

Expand Down
126 changes: 126 additions & 0 deletions PythonRpcServer/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
import subprocess
import json
from time import perf_counter
from ffmpy import FFmpeg
import utils

# Whisper command to launch; set the WHISPER_EXE environment variable to
# override the default command name 'whisper'.
WHISPER_EXECUTABLE = os.getenv('WHISPER_EXE', 'whisper')
# Whisper model file; set WHISPER_MODEL to override the default relative path.
MODEL = os.getenv('WHISPER_MODEL', 'models/ggml-base.en.bin')

def convert_video_to_wav(input_filepath, offset=None):
    """
    Extract the audio of a media file into a temporary WAV file via ffmpy.

    Args:
        input_filepath: Path of the media file handed to ffmpeg as input.
        offset: Value passed to ffmpeg's ``-ss`` input option; ``None`` is
            treated as ``0.0``.

    Returns:
        Tuple ``(output_filepath, '.wav')`` where ``output_filepath`` comes
        from ``utils.getTmpFile()``.

    Raises:
        Any exception raised while converting; it is printed and re-raised.
    """
    try:
        started_at = perf_counter()
        offset = 0.0 if offset is None else offset

        nthreads = utils.getMaxThreads()

        print(f"Converting video '{input_filepath}' to WAV with offset {offset} using {nthreads} thread(s).")
        output_filepath = utils.getTmpFile()
        ext = '.wav'

        # ffmpeg option strings, spelled out before the command is assembled.
        global_opts = f"-hide_banner -loglevel error -nostats -threads {nthreads}"
        input_opts = f'-ss {offset}'
        output_opts = '-c:a pcm_s16le -ac 1 -y -ar 16000 -f wav'

        converter = FFmpeg(
            global_options=global_opts,
            inputs={input_filepath: input_opts},
            outputs={output_filepath: output_opts},
        )
        print(f"Starting conversion. Audio output will be saved in {output_filepath}")
        converter.run()

        elapsed_seconds = int(perf_counter() - started_at)
        print(f"Conversion complete. Duration: {elapsed_seconds} seconds")
        return output_filepath, ext
    except Exception as e:
        print("Exception during conversion:" + str(e))
        raise

def _load_json_file(path):
    """Read and parse the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


def _remove_quietly(path, label):
    """Best-effort delete of a temporary file; a failure is printed, never raised."""
    try:
        os.remove(path)
        print(f"Deleted the {label} file: {path}")
    except FileNotFoundError:
        # Nothing to clean up (e.g. Whisper never produced its output).
        pass
    except OSError as e:
        print(f"Error deleting {label} file: {str(e)}")


def transcribe_audio(media_filepath, testing=False):
    """
    Transcribe a media file with the Whisper executable and return its JSON output.

    Args:
        media_filepath: Path of the media file. Paths not ending in ``.wav``
            are first converted with ``convert_video_to_wav``. The special
            value ``'TEST-transcribe_example_result'`` returns the canned
            result stored in ``transcribe_exampleffmp_result.json``.
        testing: When true, skip Whisper entirely and return the canned
            result stored in
            ``/PythonRpcServer/transcribe_hellohellohello.wav.json``.

    Returns:
        The parsed JSON document written by Whisper (or the canned result).

    Raises:
        FileNotFoundError: The media file, or Whisper's JSON output, is missing.
        RuntimeError: The Whisper process exited with a non-zero status.
    """
    if testing:
        transcription_result = _load_json_file("/PythonRpcServer/transcribe_hellohellohello.wav.json")

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result

    if media_filepath == 'TEST-transcribe_example_result':
        return _load_json_file('transcribe_exampleffmp_result.json')

    # Ensure the media file exists
    if not os.path.exists(media_filepath):
        raise FileNotFoundError(f"Media file not found: {media_filepath}")

    # Convert to WAV if needed, remembering whether we own a temporary file.
    wav_created = False
    if not media_filepath.endswith('.wav'):
        media_filepath, _ = convert_video_to_wav(media_filepath)
        wav_created = True

    # Path to the output JSON file that Whisper will generate
    json_output_path = f"{media_filepath}.json"

    try:
        # Drop any stale output so a leftover file is never mistaken for a
        # fresh result.
        if os.path.exists(json_output_path):
            os.remove(json_output_path)

        # Command to run Whisper.cpp inside the container
        whisper_command = [
            WHISPER_EXECUTABLE,     # Path to Whisper executable
            '-ojf',                 # Output as JSON file
            '-f', media_filepath,   # Media file path
            '-m', MODEL,
        ]

        print("Running Whisper transcription inside the container...")

        result = subprocess.run(whisper_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0:
            # errors='replace' so undecodable bytes in stderr cannot mask the
            # real failure with a UnicodeDecodeError.
            stderr_text = result.stderr.decode('utf-8', errors='replace')
            raise RuntimeError(f"Whisper failed with error:\n{stderr_text}")

        print(f"Checking for JSON output at: {json_output_path}")
        if not os.path.exists(json_output_path):
            raise FileNotFoundError(f"Expected JSON output file not found: {json_output_path}")

        transcription_result = _load_json_file(json_output_path)

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result
    finally:
        # Clean up on success AND failure, so a failed run does not leak the
        # temporary WAV or a partial JSON file.
        _remove_quietly(json_output_path, "JSON")
        if wav_created:
            _remove_quietly(media_filepath, "WAV")

# Example usage: load the sample transcription JSON and pretty-print it.
if __name__ == '__main__':
    # Hard-coded location of the sample file; this only works where that path exists.
    sample_json_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
    with open(sample_json_path, 'r') as sample_file:
        sample_result = json.load(sample_file)

    pretty_result = json.dumps(sample_result, indent=4)
    print("Transcription Result:", pretty_result)

Loading

0 comments on commit 042cc8d

Please sign in to comment.