forked from cubrink/srt-parse
-
Notifications
You must be signed in to change notification settings - Fork 2
/
srt-parse.py
135 lines (112 loc) · 4.77 KB
/
srt-parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import sys
import srt
import argparse
from pydub import AudioSegment
def _to_milliseconds(delta):
    '''
    Converts a timedelta-like value to whole milliseconds using integer math

    Keyword arguments:
    delta - object exposing the days, seconds and microseconds attributes
            of datetime.timedelta
    '''
    # 86400000 = milliseconds per day. Working on the integer fields avoids
    # the float round-trip of total_seconds() * 1000, which can land just
    # below the true value (1.001 s -> 1000.9999999999999 -> 1000).
    return delta.days * 86400000 + delta.seconds * 1000 + delta.microseconds // 1000


def get_slice_indexes(sub):
    '''
    Returns the indexes used to slice the corresponding audio from the subtitle

    The result is a (start, end) tuple of integer millisecond offsets.
    Fractions of a millisecond are dropped, as before, but the conversion is
    exact: no millisecond is lost to floating point rounding.

    Keyword arguments:
    sub - a srt subtitle; its start and end attributes are expected to be
          datetime.timedelta values
    '''
    return _to_milliseconds(sub.start), _to_milliseconds(sub.end)
def build_parser():
    '''
    Creates an argparse parser

    Returns an argparse.ArgumentParser with two positional arguments
    (audio_input, srt_input) and the optional output / encoding settings
    defined below. Nothing is parsed here; the caller invokes parse_args().
    '''
    parser = argparse.ArgumentParser(description='Segment .wav files according to a provided .srt closed caption file',
                                     prog='srt-parse')

    # Required inputs
    parser.add_argument('audio_input', type=str,
                        help='Location of .wav file to be processed')
    parser.add_argument('srt_input', type=str,
                        help='Location of .srt file to be processed')

    # Output location and naming.
    # The default directory is built with os.path.join so it uses the
    # platform's separator: '.\\out\\' on Windows (unchanged), './out/' on
    # POSIX instead of a directory literally named '.\out\'. The empty last
    # component keeps the trailing separator.
    parser.add_argument('--output-dir', type=str,
                        help='Directory for processed files to be saved to',
                        default=os.path.join('.', 'out', ''))
    # The two patterns are filled in with str.format(idx), so '{}' is
    # replaced by the segment index.
    parser.add_argument('--audio-out-file-pattern', type=str,
                        help='A python-style f-string for saving audio files',
                        default='{}-audio.wav')
    parser.add_argument('--text-out-file-pattern', type=str,
                        help='A python-style f-string for saving text files',
                        default='{}-text.txt')

    # Output format selection and csv-specific settings
    parser.add_argument('--output-type', type=str,
                        help='Output filetype',
                        choices=['txt', 'csv'],
                        default='csv')
    # NOTE: the option name keeps its historical spelling ("seperator")
    # because it is part of the command line interface.
    parser.add_argument('--csv-seperator', type=str,
                        help='Character sequence used to seperate values in csv',
                        default=',')
    parser.add_argument('--csv-filename', type=str,
                        help='Name of file to write as csv',
                        default='out.csv')

    # Progress reporting
    parser.add_argument('--update-increment', type=int,
                        help='Print progress after every specified amount of segments.',
                        default=25)

    # Text encodings; None for --out-encoding is handed to open() as-is
    parser.add_argument('--in-encoding', type=str,
                        help='Encoding used to read the .srt file',
                        default='utf-8')
    parser.add_argument('--out-encoding', type=str,
                        help='Encoding to use when writing text data to file',
                        default=None)

    return parser
def write_txt():
    '''
    Write data in .txt format

    For every subtitle in the module-level subs, exports the matching slice
    of the module-level audio as a .wav file and writes the subtitle text
    (newlines replaced by spaces) to its own text file. File names come from
    the audio/text patterns in args, formatted with the segment index.
    Progress is printed every args.update_increment segments.
    '''
    for idx, sub in enumerate(subs):
        if idx % args.update_increment == 0:
            print(f'Processing segment #{idx}')

        start, end = get_slice_indexes(sub)
        clip = audio[start:end]

        # Both paths go through os.path.join so that --output-dir works with
        # or without a trailing separator. Plain string concatenation put the
        # audio next to the directory ("out0-audio.wav") when it was missing.
        audio_path = os.path.join(args.output_dir, args.audio_out_file_pattern.format(idx))
        text_path = os.path.join(args.output_dir, args.text_out_file_pattern.format(idx))

        clip.export(audio_path, format='wav')
        with open(text_path, 'w', encoding=args.out_encoding) as f:
            f.write(sub.content.replace('\n', ' '))
def write_csv():
    '''
    Write data in .csv format

    Produces one .wav clip per subtitle in the output directory plus a single
    index file (args.csv_filename) with one line per clip:
    the clip's file name, args.csv_seperator, then the subtitle text with
    newlines replaced by spaces. Values are joined as-is; no quoting or
    escaping is applied. Reads the module-level subs, audio and args.
    '''
    index_path = os.path.join(args.output_dir, args.csv_filename)
    with open(index_path, 'w', encoding=args.out_encoding) as index_file:
        for number, caption in enumerate(subs):
            # Periodic progress report
            if number % args.update_increment == 0:
                print(f'Processing segment #{number}')

            # Cut the audio belonging to this caption and save it
            begin_ms, finish_ms = get_slice_indexes(caption)
            segment = audio[begin_ms:finish_ms]
            clip_name = args.audio_out_file_pattern.format(number)
            segment.export(os.path.join(args.output_dir, clip_name), format='wav')

            # Record "<clip name><separator><single-line text>" in the index
            single_line_text = caption.content.replace('\n', ' ')
            row = args.csv_seperator.join([clip_name, single_line_text])
            index_file.write(row)
            index_file.write('\n')
def get_write_function():
    '''
    Pick the writer matching the requested output type.

    Returns write_txt or write_csv depending on args.output_type; an
    unknown type raises KeyError.
    '''
    writers_by_type = dict(txt=write_txt, csv=write_csv)
    requested_type = args.output_type
    return writers_by_type[requested_type]
def get_subs():
    '''
    Returns a generator yielding parsed captions from the .srt

    The file named by args.srt_input is read completely (decoded with
    args.in_encoding) and closed before the generator is handed back, so
    parsing happens lazily on the in-memory text.
    '''
    with open(args.srt_input, 'r', encoding=args.in_encoding) as srt_file:
        raw_text = srt_file.read()

    return (caption for caption in srt.parse(raw_text))
# Get parser and parse
parser = build_parser()
args = parser.parse_args()

# Error checking
if args.update_increment <= 0:
    parser.error("Update increment must be a positive number")

# Make sure the output directory exists.
# makedirs(exist_ok=True) creates missing parent directories, accepts an
# already existing directory, and raises when the path (or one of its
# parents) exists but is not a directory. The previous exists()/mkdir()
# combination never reported that last case, because mkdir() was skipped
# whenever the path already existed.
try:
    os.makedirs(args.output_dir, exist_ok=True)
except (FileExistsError, NotADirectoryError):
    # parser.error() prints the message and terminates the program itself,
    # so no explicit exit call is needed afterwards.
    parser.error(f"Output path {args.output_dir} is not a directory.\nsrt-parse will now exit.")

# Open srt file
subs = get_subs()

# Open audio file
audio = AudioSegment.from_wav(args.audio_input)

# Determine which writing function to use
write = get_write_function()

# Process audio clips
write()

print('Processing finished!')