Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update for whisper #489

Merged
merged 24 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ClassTranscribeDatabase/CommonUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public enum TaskType
DownloadPlaylistInfo = 3,
DownloadMedia = 4,
ConvertMedia = 5,
TranscribeVideo = 6,
// TranscribeVideo = 6,
ProcessVideo = 7,
Aggregator = 8,
GenerateVTTFile = 9,
Expand All @@ -39,7 +39,9 @@ public enum TaskType
PythonCrawler = 19,

DescribeVideo = 20,
DescribeImage = 21
DescribeImage = 21,
AzureTranscribeVideo = 22,
LocalTranscribeVideo = 23

}

Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeDatabase/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.201"
"version": "8.0"
}
}
4 changes: 2 additions & 2 deletions ClassTranscribeServer/Controllers/PlaylistsController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ public async Task<ActionResult<IEnumerable<PlaylistDTO>>> GetPlaylists2(string o
JsonMetadata = m.JsonMetadata,
CreatedAt = m.CreatedAt,
SceneDetectReady = m.Video.HasSceneObjectData(),
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
SourceType = m.SourceType,
Duration = m.Video?.Duration,
PublishStatus = m.PublishStatus,
Expand Down Expand Up @@ -265,7 +265,7 @@ public async Task<ActionResult<PlaylistDTO>> GetPlaylist(string id)
PublishStatus = m.PublishStatus,
Options = m.GetOptionsAsJson(),
SceneDetectReady = m.Video != null && m.Video.HasSceneObjectData(),
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
Video = m.Video == null ? null : new VideoDTO
{
Id = m.Video.Id,
Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeServer/Utils/WakeDownloader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public void TranscribeVideo(string videoOrMediaId, bool deleteExisting)
{
JObject msg = new JObject
{
{ "Type", TaskType.TranscribeVideo.ToString() },
{ "Type", TaskType.LocalTranscribeVideo.ToString() },
{ "videoOrMediaId", videoOrMediaId },
{ "DeleteExisting", deleteExisting }
};
Expand Down
2 changes: 1 addition & 1 deletion ClassTranscribeServer/global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sdk": {
"version": "8.0.401"
"version": "8.0"
}
}
1 change: 1 addition & 0 deletions PythonRpcServer/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
5 changes: 3 additions & 2 deletions PythonRpcServer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ wcwidth==0.2.13

# Not versioned
numpy
pytube # if not available, use the tar.gz package (see Dockerfile)

# No longer maintained pytube # if not available, use the tar.gz package (see Dockerfile)
yt-dlp
#Always get latest

# protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
# Likely need to coordinate updating the C# version too
Expand Down
34 changes: 33 additions & 1 deletion PythonRpcServer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from echo import EchoProvider
from kaltura import KalturaProvider
from mediaprovider import InvalidPlaylistInfoException
from transcribe import transcribe_audio

import json
import hasher
import ffmpeg
# import phrasehinter
Expand Down Expand Up @@ -41,6 +44,18 @@ def LogWorker(logId, worker):


class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
# Transcribe the text into a JSON string.
# Make it return a JSON string.
# Change the name to TranscribeRPC.
# def CaptionRPC(self, request, context):
# #See CaptionRequest
# print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
# kalturaprovider = KalturaProvider()
# result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId))
# return ct_pb2.JsonString(json = result)



def GetScenesRPC(self, request, context):
raise NotImplementedError('Implementation now in pyapi')
# res = scenedetector.find_scenes(request.filePath)
Expand Down Expand Up @@ -113,14 +128,31 @@ def ComputeFileHash(self, request, context):
def GetMediaInfoRPC(self, request, context):
    """
    Return media information for ``request.filePath`` as a JsonString.

    The lookup (``ffmpeg.getMediaInfo``) is executed through ``LogWorker``,
    labelled with the file path, and its result is placed in the ``json``
    field of the reply unchanged.
    """
    worker_label = f"GetMediaInfo({request.filePath})"
    media_info = LogWorker(worker_label, lambda: ffmpeg.getMediaInfo(request.filePath))
    return ct_pb2.JsonString(json=media_info)


def TranscribeAudioRPC(self, request, context):
    """
    Transcribe the media file named by ``request.filePath``.

    The work is delegated to ``transcribe_audio(request.filePath,
    request.testing)``, executed through ``LogWorker``.

    Returns:
        ct_pb2.JsonString whose ``json`` field is the JSON-encoded
        transcription result. On failure the gRPC status is set to INTERNAL
        with a "Transcription failed: ..." detail message, and the payload
        is ``{"error": "<message>"}`` instead.
    """
    print(f"TranscribeAudioRPC({request.logId};{request.filePath})")
    try:
        logging.info(f"Starting transcription for file: {request.filePath}")
        transcription_result = LogWorker(
            f"TranscribeAudioRPC({request.filePath})",
            lambda: transcribe_audio(request.filePath, request.testing)
        )
        logging.info(f"Transcription completed successfully for: {request.filePath}")
        return ct_pb2.JsonString(json=json.dumps(transcription_result))

    except Exception as e:
        # Keep the full traceback in the server log; the client only
        # receives the short message set on the gRPC context below.
        logging.exception(f"Transcription failed for: {request.filePath}")
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(f"Transcription failed: {str(e)}")
        return ct_pb2.JsonString(json=json.dumps({"error": str(e)}))

def serve():
print("Python RPC Server Starting")

# Until we can ensure no timeouts on remote services, the default here is set to a conservative low number
# This is to ensure we can still make progress even if every python tasks tries to use all cpu cores.
max_workers=int(os.getenv('NUM_PYTHON_WORKERS', 3))
print(f"max_workers={max_workers}")
print(f"max_workers={max_workers}. Starting up grpc server...")

server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

Expand Down
126 changes: 126 additions & 0 deletions PythonRpcServer/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
import subprocess
import json
from time import perf_counter
from ffmpy import FFmpeg
import utils

# Whisper executable to invoke; overridable via the WHISPER_EXE environment variable (defaults to 'whisper').
# NOTE(review): an earlier note here said an executable named 'main' is assumed to sit in the same
# directory as this script, which does not match the 'whisper' default -- confirm which is intended.
WHISPER_EXECUTABLE = os.environ.get('WHISPER_EXE','whisper')
# Model file passed to Whisper with '-m'; overridable via the WHISPER_MODEL environment variable.
MODEL = os.environ.get('WHISPER_MODEL','models/ggml-base.en.bin')

def convert_video_to_wav(input_filepath, offset=None):
    """
    Converts a media file to a mono, 16 kHz, 16-bit PCM WAV file using ffmpy.

    Args:
        input_filepath: Path of the media file to read.
        offset: Start position passed to ffmpeg's ``-ss`` input option.
            ``None`` is treated as 0.0.

    Returns:
        A ``(output_filepath, ext)`` tuple: the path obtained from
        ``utils.getTmpFile()`` that the audio was written to, and the
        literal extension ``'.wav'``.
        NOTE(review): ``ext`` is not appended to ``output_filepath`` here;
        whether the temp path already ends in '.wav' depends on
        ``utils.getTmpFile()`` -- confirm.

    Raises:
        Exception: Any error raised during conversion is printed and then
            re-raised unchanged (original traceback preserved).
    """
    output_filepath = None
    try:
        start_time = perf_counter()
        if offset is None:
            offset = 0.0

        nthreads = utils.getMaxThreads()

        print(f"Converting video '{input_filepath}' to WAV with offset {offset} using {nthreads} thread(s).")
        output_filepath = utils.getTmpFile()
        ext = '.wav'

        ff = FFmpeg(
            global_options=f"-hide_banner -loglevel error -nostats -threads {nthreads}",
            inputs={input_filepath: f'-ss {offset}'},
            outputs={output_filepath: '-c:a pcm_s16le -ac 1 -y -ar 16000 -f wav'}
        )
        print(f"Starting conversion. Audio output will be saved in {output_filepath}")
        ff.run()
        end_time = perf_counter()
        print(f"Conversion complete. Duration: {int(end_time - start_time)} seconds")
        return output_filepath, ext
    except Exception as e:
        print("Exception during conversion:" + str(e))
        # The caller never receives the path on failure, so remove any
        # partially written output here instead of leaking it.
        if output_filepath is not None and os.path.exists(output_filepath):
            try:
                os.remove(output_filepath)
            except OSError as cleanup_error:
                print(f"Error deleting partial WAV file: {str(cleanup_error)}")
        # Bare raise keeps the original traceback intact.
        raise

def transcribe_audio(media_filepath, testing=False):
    """
    Transcribe a media file with the Whisper executable and return the parsed JSON.

    Args:
        media_filepath: Path of the media file. Files whose name does not end
            in '.wav' are first converted with ``convert_video_to_wav``. The
            special value 'TEST-transcribe_example_result' returns the content
            of 'transcribe_exampleffmp_result.json' without running Whisper.
        testing: When true, return the canned result stored at
            '/PythonRpcServer/transcribe_hellohellohello.wav.json' without
            running Whisper.

    Returns:
        The transcription as the object decoded from Whisper's JSON output.

    Raises:
        FileNotFoundError: The media file, or the JSON file Whisper was
            expected to write, does not exist.
        RuntimeError: The Whisper process exited with a non-zero status.
    """
    if testing:
        json_output_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
        with open(json_output_path, 'r', encoding='utf-8') as json_file:
            transcription_result = json.load(json_file)

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result

    if media_filepath == 'TEST-transcribe_example_result':
        result_json_file = 'transcribe_exampleffmp_result.json'
        with open(result_json_file, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    # Ensure the media file exists
    if not os.path.exists(media_filepath):
        raise FileNotFoundError(f"Media file not found: {media_filepath}")

    # Convert to WAV if needed, remembering whether the WAV is ours to delete.
    wav_created = False
    if not media_filepath.endswith('.wav'):
        media_filepath, _ = convert_video_to_wav(media_filepath)
        wav_created = True

    # Whisper writes its JSON output next to the input as "<input>.json".
    json_output_path = f"{media_filepath}.json"

    try:
        # Remove a stale result so a leftover file can't be mistaken for
        # the output of this run.
        if os.path.exists(json_output_path):
            os.remove(json_output_path)

        whisper_command = [
            WHISPER_EXECUTABLE,    # Path to Whisper executable
            '-ojf',                # Write the result to a JSON file
            '-f', media_filepath,  # Media file path
            '-m', MODEL
        ]

        print("Running Whisper transcription inside the container...")
        result = subprocess.run(whisper_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0:
            # errors='replace' so undecodable bytes in stderr cannot mask
            # the real failure with a UnicodeDecodeError.
            stderr_text = result.stderr.decode('utf-8', errors='replace')
            raise RuntimeError(f"Whisper failed with error:\n{stderr_text}")

        print(f"Checking for JSON output at: {json_output_path}")
        if not os.path.exists(json_output_path):
            raise FileNotFoundError(f"Expected JSON output file not found: {json_output_path}")

        # Explicit UTF-8 so decoding does not depend on the container locale.
        with open(json_output_path, 'r', encoding='utf-8') as json_file:
            transcription_result = json.load(json_file)

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result
    finally:
        # Clean up on every exit path, including Whisper failures and
        # unparsable output, so temporary files do not accumulate.
        if os.path.exists(json_output_path):
            try:
                os.remove(json_output_path)
                print(f"Deleted the JSON file: {json_output_path}")
            except OSError as e:
                print(f"Error deleting JSON file: {str(e)}")

        if wav_created:
            try:
                os.remove(media_filepath)
                print(f"Deleted the WAV file: {media_filepath}")
            except OSError as e:
                print(f"Error deleting WAV file: {str(e)}")

# Example usage: load the stored sample transcription and pretty-print it.
if __name__ == '__main__':
    # Location of the canned sample result inside the container; no
    # transcription is run by this entry point.
    sample_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
    with open(sample_path, 'r') as sample_file:
        sample_result = json.load(sample_file)

    print("Transcription Result:", json.dumps(sample_result, indent=4))

Loading
Loading