This repository has been archived by the owner on Oct 17, 2023. It is now read-only.
forked from purdy/aws-transcribe-transcript
-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathtranscript.py
47 lines (44 loc) · 1.43 KB
/
transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python3
import datetime
import json
import sys


def _speaker_by_start_time(segments):
    """Map each labelled item's ``start_time`` to its ``speaker_label``."""
    return {
        item['start_time']: item['speaker_label']
        for segment in segments
        for item in segment['items']
    }


def _build_turns(data):
    """Group transcript items into consecutive single-speaker turns.

    Reads ``data['results']['speaker_labels']['segments']`` and
    ``data['results']['items']`` and returns a list of dicts with the keys
    ``'speaker'``, ``'line'`` and ``'time'`` (the ``start_time`` of the first
    item of the turn), ordered by time.

    Raises:
        KeyError: if an expected key is missing, or an item's ``start_time``
            has no entry in the speaker labels.
    """
    speaker_start_times = _speaker_by_start_time(
        data['results']['speaker_labels']['segments'])

    turns = []
    line = ''
    time = 0
    # None (rather than a truthy placeholder string) marks "no turn started
    # yet", so nothing is flushed before the first real speaker appears.
    speaker = None
    current_speaker = None
    for item in data['results']['items']:
        content = item['alternatives'][0]['content']
        if item.get('start_time'):
            current_speaker = speaker_start_times[item['start_time']]
        elif item['type'] == 'punctuation':
            # Punctuation attaches directly to the preceding word.
            line = line + content
        if current_speaker != speaker:
            # Speaker changed: flush the finished turn and start a new one.
            if speaker is not None:
                turns.append({'speaker': speaker, 'line': line, 'time': time})
            line = content
            speaker = current_speaker
            time = item['start_time']
        elif item['type'] != 'punctuation':
            line = line + ' ' + content
    # Flush the last turn, if any item ever started one.
    if speaker is not None:
        turns.append({'speaker': speaker, 'line': line, 'time': time})
    return sorted(turns, key=lambda turn: float(turn['time']))


def _format_turn(turn):
    """Render one turn as ``[H:MM:SS] speaker: text``."""
    seconds = int(round(float(turn['time'])))
    stamp = datetime.timedelta(seconds=seconds)
    return '[{}] {}: {}'.format(stamp, turn['speaker'], turn['line'])


def main(argv=None):
    """Convert a speaker-labelled transcription JSON file to plain text.

    The input is presumably an Amazon Transcribe job result with speaker
    identification enabled (verify against the producer); only the keys
    read by ``_build_turns`` are required. The transcript is written next
    to the input as ``<input path>.txt``, one turn per paragraph.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``; the first element is the input JSON path.

    Raises:
        SystemExit: if no input path is given.
        OSError: if the input cannot be read or the output cannot be written.
        KeyError: if the JSON lacks the expected structure.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit('usage: {} <transcribe-output.json>'.format(sys.argv[0]))
    filename = argv[0]
    print("Filename: ", filename)

    # Parse and process the input completely before touching the output
    # file, so a bad input never leaves an empty/truncated .txt behind.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    turns = _build_turns(data)

    # newline='\n' disables platform newline translation, so the bytes
    # written are the same on every OS.
    with open(filename + '.txt', 'w', encoding='utf-8', newline='\n') as w:
        w.writelines(_format_turn(turn) + '\n\n' for turn in turns)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()